{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999146539216524, "eval_steps": 500, "global_step": 5858, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 699.97265625, "completions/mean_terminated_length": 650.854248046875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.00017069215669539986, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7118805369961798, "kl": 0.00024771690368652344, "learning_rate": 0.0, "loss": 0.0013, "num_tokens": 221305.0, "reward": 1.7587890625, "reward_std": 0.520090639591217, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 0.70703125, "rewards/format_reward/std": 0.45601576566696167, "rewards/tag_count_reward/mean": 0.9072265625, "rewards/tag_count_reward/std": 0.17555660009384155, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 664.51171875, "completions/mean_terminated_length": 659.0863037109375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.0003413843133907997, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6703242072909699, "kl": 0.000240325927734375, "learning_rate": 3.412969283276451e-08, "loss": 0.0015, "num_tokens": 433548.0, "reward": 1.7197265625, "reward_std": 0.48479723930358887, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.76953125, "rewards/format_reward/std": 0.4219578504562378, "rewards/tag_count_reward/mean": 0.9267578125, "rewards/tag_count_reward/std": 0.14771738648414612, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 703.83203125, "completions/mean_terminated_length": 703.83203125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.0005120764700861995, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7427426520284355, "kl": 0.00020956993103027344, "learning_rate": 6.825938566552902e-08, "loss": 0.0143, "num_tokens": 652017.0, "reward": 1.666015625, "reward_std": 0.5433375239372253, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.71484375, "rewards/format_reward/std": 0.4523732364177704, "rewards/tag_count_reward/mean": 0.904296875, "rewards/tag_count_reward/std": 0.19077010452747345, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 792.36328125, "completions/mean_terminated_length": 782.4763793945312, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.0006827686267815994, "frac_reward_zero_std": 0.0, "grad_norm": 0.47271924074274324, "kl": 0.0002186298370361328, "learning_rate": 1.0238907849829352e-07, "loss": -0.004, "num_tokens": 895438.0, "reward": 1.8017578125, "reward_std": 0.5496652126312256, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 0.7890625, "rewards/format_reward/std": 0.4087733030319214, "rewards/tag_count_reward/mean": 0.9345703125, "rewards/tag_count_reward/std": 0.15135405957698822, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 669.6484375, "completions/mean_terminated_length": 669.6484375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.0008534607834769992, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7270683268395404, "kl": 0.0002110004425048828, "learning_rate": 1.3651877133105803e-07, "loss": -0.0019, "num_tokens": 1107892.0, "reward": 1.779296875, "reward_std": 0.555311918258667, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.4425306022167206, "rewards/tag_count_reward/mean": 0.919921875, "rewards/tag_count_reward/std": 0.14495974779129028, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 637.5703125, "completions/mean_terminated_length": 632.0392456054688, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.001024152940172399, "frac_reward_zero_std": 0.0625, "grad_norm": 0.634611470529484, "kl": 0.0002498626708984375, "learning_rate": 1.7064846416382255e-07, "loss": 0.0267, "num_tokens": 1317590.0, "reward": 1.8046875, "reward_std": 0.5386633276939392, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.78515625, "rewards/format_reward/std": 0.4115184545516968, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1684342622756958, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 769.51171875, "completions/mean_terminated_length": 738.8280639648438, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.0011948450968677989, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4542928450433766, "kl": 0.00022101402282714844, "learning_rate": 2.0477815699658704e-07, "loss": 0.0558, "num_tokens": 1561657.0, "reward": 1.9052734375, "reward_std": 0.5527794361114502, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3483152687549591, "rewards/tag_count_reward/mean": 0.9208984375, "rewards/tag_count_reward/std": 0.23187804222106934, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 774.96875, "completions/mean_terminated_length": 733.9031982421875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.0013655372535631989, "frac_reward_zero_std": 0.0, "grad_norm": 0.6497880991690848, "kl": 0.0002589225769042969, "learning_rate": 2.3890784982935155e-07, "loss": 0.0277, "num_tokens": 1799009.0, "reward": 1.4560546875, "reward_std": 0.5617755651473999, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.59765625, "rewards/format_reward/std": 0.4913311004638672, "rewards/tag_count_reward/mean": 0.8505859375, "rewards/tag_count_reward/std": 0.2298542857170105, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1752.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 706.33203125, "completions/mean_terminated_length": 706.33203125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.0015362294102585986, "frac_reward_zero_std": 0.0, "grad_norm": 0.6278756842397455, "kl": 0.0002110004425048828, "learning_rate": 2.7303754266211607e-07, "loss": -0.0104, "num_tokens": 2022086.0, "reward": 1.7626953125, "reward_std": 0.5346428751945496, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4338609278202057, "rewards/tag_count_reward/mean": 0.9228515625, "rewards/tag_count_reward/std": 0.15228737890720367, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 677.95703125, "completions/mean_terminated_length": 650.6653442382812, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.0017069215669539984, "frac_reward_zero_std": 0.0625, "grad_norm": 0.581058222054077, "kl": 0.0005135536193847656, "learning_rate": 3.071672354948806e-07, "loss": 0.0126, "num_tokens": 2240523.0, "reward": 1.791015625, "reward_std": 0.5035039186477661, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24256734549999237, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40311288833618164, "rewards/tag_count_reward/mean": 0.935546875, "rewards/tag_count_reward/std": 0.14601267874240875, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 646.09765625, "completions/mean_terminated_length": 623.8452758789062, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.0018776137236493984, "frac_reward_zero_std": 0.0, "grad_norm": 0.5906895591699449, "kl": 0.0005769729614257812, "learning_rate": 3.412969283276451e-07, "loss": 0.0239, "num_tokens": 2455876.0, "reward": 1.755859375, "reward_std": 0.526915431022644, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31272050738334656, "rewards/format_reward/mean": 0.73046875, "rewards/format_reward/std": 0.44458550214767456, "rewards/tag_count_reward/mean": 0.916015625, "rewards/tag_count_reward/std": 0.16202561557292938, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 677.19921875, "completions/mean_terminated_length": 649.8924560546875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.002048305880344798, "frac_reward_zero_std": 0.25, "grad_norm": 0.4144687108739731, "kl": 0.0003895759582519531, "learning_rate": 3.754266211604096e-07, "loss": 0.0202, "num_tokens": 2674343.0, "reward": 1.9619140625, "reward_std": 0.4173893928527832, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.3562295734882355, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3483152687549591, "rewards/tag_count_reward/mean": 0.9541015625, "rewards/tag_count_reward/std": 0.13318413496017456, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 779.38671875, "completions/mean_terminated_length": 759.2500610351562, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.002218998037040198, "frac_reward_zero_std": 0.25, "grad_norm": 0.31456281573296646, "kl": 0.00033855438232421875, "learning_rate": 4.0955631399317407e-07, "loss": 0.0261, "num_tokens": 2911386.0, "reward": 1.9208984375, "reward_std": 0.38588908314704895, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33136674761772156, "rewards/tag_count_reward/mean": 0.9599609375, "rewards/tag_count_reward/std": 0.12170520424842834, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1938.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 652.36328125, "completions/mean_terminated_length": 652.36328125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.0023896901937355977, "frac_reward_zero_std": 0.375, "grad_norm": 0.3671543194766767, "kl": 0.0008182525634765625, "learning_rate": 4.436860068259386e-07, "loss": 0.0021, "num_tokens": 3118887.0, "reward": 1.8828125, "reward_std": 0.2637982666492462, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.1564512401819229, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.31272050738334656, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09393364936113358, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 659.18359375, "completions/mean_terminated_length": 659.18359375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.0025603823504309975, "frac_reward_zero_std": 0.1875, "grad_norm": 0.4272070482722128, "kl": 0.0010633468627929688, "learning_rate": 4.778156996587031e-07, "loss": -0.0053, "num_tokens": 3330166.0, "reward": 2.0068359375, "reward_std": 0.33863478899002075, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.28082075715065, "rewards/tag_count_reward/mean": 0.9755859375, "rewards/tag_count_reward/std": 0.09203393012285233, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1506.0, "completions/max_terminated_length": 1506.0, "completions/mean_length": 665.77734375, "completions/mean_terminated_length": 665.77734375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.0027310745071263977, "frac_reward_zero_std": 0.375, "grad_norm": 0.30718621226176224, "kl": 0.0005998611450195312, "learning_rate": 5.119453924914676e-07, "loss": 0.0189, "num_tokens": 3540413.0, "reward": 2.0322265625, "reward_std": 0.2934136986732483, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3077581524848938, "rewards/format_reward/mean": 0.94921875, "rewards/format_reward/std": 0.21998079121112823, "rewards/tag_count_reward/mean": 0.9775390625, "rewards/tag_count_reward/std": 0.12018531560897827, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 643.20703125, "completions/mean_terminated_length": 643.20703125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.0029017666638217975, "frac_reward_zero_std": 0.375, "grad_norm": 0.3039558973372514, "kl": 0.0005950927734375, "learning_rate": 5.460750853242321e-07, "loss": 0.0146, "num_tokens": 3744530.0, "reward": 2.15625, "reward_std": 0.288375586271286, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.048884619027376175, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1617.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 626.16015625, "completions/mean_terminated_length": 626.16015625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.0030724588205171973, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2560363451855982, "kl": 0.0026979446411132812, "learning_rate": 5.802047781569966e-07, "loss": 0.0053, "num_tokens": 3944795.0, "reward": 2.015625, "reward_std": 0.12598475813865662, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1685.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 751.14453125, "completions/mean_terminated_length": 751.14453125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.003243150977212597, "frac_reward_zero_std": 0.625, "grad_norm": 0.23078944554079506, "kl": 0.0008916854858398438, "learning_rate": 6.143344709897612e-07, "loss": 0.0019, "num_tokens": 4176544.0, "reward": 2.078125, "reward_std": 0.1418628990650177, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 727.765625, "completions/mean_terminated_length": 712.1107177734375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.003413843133907997, "frac_reward_zero_std": 0.6875, "grad_norm": 0.20047409277183043, "kl": 0.00125885009765625, "learning_rate": 6.484641638225256e-07, "loss": 0.0359, "num_tokens": 4400964.0, "reward": 1.990234375, "reward_std": 0.12505322694778442, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.08226180076599121, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 752.28515625, "completions/mean_terminated_length": 742.0827026367188, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.0035845352906033966, "frac_reward_zero_std": 0.4375, "grad_norm": 0.22612500625303025, "kl": 0.00135040283203125, "learning_rate": 6.825938566552902e-07, "loss": 0.0178, "num_tokens": 4632989.0, "reward": 2.0830078125, "reward_std": 0.23076124489307404, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06789661198854446, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 768.875, "completions/mean_terminated_length": 753.70751953125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.003755227447298797, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1449391014310713, "kl": 0.0023136138916015625, "learning_rate": 7.167235494880546e-07, "loss": 0.0269, "num_tokens": 4871597.0, "reward": 2.0966796875, "reward_std": 0.15673679113388062, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.09693823754787445, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 754.0234375, "completions/mean_terminated_length": 743.8346557617188, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.003925919603994197, "frac_reward_zero_std": 0.5, "grad_norm": 0.26592671647711374, "kl": 0.001316070556640625, "learning_rate": 7.508532423208192e-07, "loss": 0.0535, "num_tokens": 5103379.0, "reward": 2.0791015625, "reward_std": 0.20352666079998016, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.080633744597435, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 698.9375, "completions/mean_terminated_length": 693.6470947265625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.004096611760689596, "frac_reward_zero_std": 0.5625, "grad_norm": 0.27860682742202103, "kl": 0.00174713134765625, "learning_rate": 7.849829351535837e-07, "loss": 0.021, "num_tokens": 5320563.0, "reward": 2.001953125, "reward_std": 0.14553415775299072, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.0539139099419117, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 690.49609375, "completions/mean_terminated_length": 674.3992309570312, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.004267303917384996, "frac_reward_zero_std": 0.6875, "grad_norm": 0.18791139747449018, "kl": 0.001979827880859375, "learning_rate": 8.191126279863481e-07, "loss": 0.0604, "num_tokens": 5536098.0, "reward": 2.0380859375, "reward_std": 0.13611388206481934, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.0808708667755127, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 634.5, "completions/mean_terminated_length": 628.9569091796875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.004437996074080396, "frac_reward_zero_std": 0.4375, "grad_norm": 0.29260593439301336, "kl": 0.0021762847900390625, "learning_rate": 8.532423208191128e-07, "loss": 0.0188, "num_tokens": 5741202.0, "reward": 2.1572265625, "reward_std": 0.21947821974754333, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.3710577189922333, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1813.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 727.77734375, "completions/mean_terminated_length": 727.77734375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.004608688230775796, "frac_reward_zero_std": 0.75, "grad_norm": 0.1787849683291201, "kl": 0.00202178955078125, "learning_rate": 8.873720136518772e-07, "loss": 0.0056, "num_tokens": 5970841.0, "reward": 2.0888671875, "reward_std": 0.12119126319885254, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 753.90625, "completions/mean_terminated_length": 748.8314208984375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.0047793803874711955, "frac_reward_zero_std": 0.75, "grad_norm": 0.16665948339393616, "kl": 0.002040863037109375, "learning_rate": 9.215017064846417e-07, "loss": 0.0136, "num_tokens": 6203409.0, "reward": 1.9921875, "reward_std": 0.08384781330823898, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 804.4609375, "completions/mean_terminated_length": 794.6693115234375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.004950072544166595, "frac_reward_zero_std": 0.3125, "grad_norm": 0.2816933944163354, "kl": 0.0019855499267578125, "learning_rate": 9.556313993174062e-07, "loss": 0.0416, "num_tokens": 6453383.0, "reward": 2.1572265625, "reward_std": 0.2921920418739319, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38467901945114136, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.0864996686577797, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 663.546875, "completions/mean_terminated_length": 635.9681396484375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.005120764700861995, "frac_reward_zero_std": 0.4375, "grad_norm": 0.28529676126726433, "kl": 0.003391265869140625, "learning_rate": 9.897610921501708e-07, "loss": 0.023, "num_tokens": 6662003.0, "reward": 2.1416015625, "reward_std": 0.29597947001457214, "rewards/accuracy_reward/mean": 0.17578125, "rewards/accuracy_reward/std": 0.3813795745372772, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9853515625, "rewards/tag_count_reward/std": 0.1039903536438942, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1371.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 675.859375, "completions/mean_terminated_length": 675.859375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.005291456857557395, "frac_reward_zero_std": 0.4375, "grad_norm": 0.23698061319569816, "kl": 0.0031280517578125, "learning_rate": 1.0238907849829352e-06, "loss": -0.0113, "num_tokens": 6875183.0, "reward": 2.1875, "reward_std": 0.23340418934822083, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 659.34765625, "completions/mean_terminated_length": 648.4133911132812, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.0054621490142527955, "frac_reward_zero_std": 0.5, "grad_norm": 0.2615733972674681, "kl": 0.0043182373046875, "learning_rate": 1.0580204778156999e-06, "loss": 0.0346, "num_tokens": 7084616.0, "reward": 2.1396484375, "reward_std": 0.2041168212890625, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.3600577116012573, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05169277638196945, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1924.0, "completions/mean_length": 769.97265625, "completions/mean_terminated_length": 764.9608154296875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.005632841170948195, "frac_reward_zero_std": 0.4375, "grad_norm": 0.25476877461403247, "kl": 0.003902435302734375, "learning_rate": 1.0921501706484643e-06, "loss": -0.005, "num_tokens": 7321409.0, "reward": 2.10546875, "reward_std": 0.192440003156662, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1801.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 714.35546875, "completions/mean_terminated_length": 714.35546875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.005803533327643595, "frac_reward_zero_std": 0.5, "grad_norm": 0.31223133943703585, "kl": 0.00614166259765625, "learning_rate": 1.1262798634812287e-06, "loss": 0.0164, "num_tokens": 7545964.0, "reward": 2.314453125, "reward_std": 0.2303830087184906, "rewards/accuracy_reward/mean": 0.32421875, "rewards/accuracy_reward/std": 0.46899911761283875, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1764.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 761.01953125, "completions/mean_terminated_length": 761.01953125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.005974225484338995, "frac_reward_zero_std": 0.5, "grad_norm": 0.2763452357358001, "kl": 0.005828857421875, "learning_rate": 1.1604095563139933e-06, "loss": -0.0056, "num_tokens": 7788225.0, "reward": 2.14453125, "reward_std": 0.18903234601020813, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1479.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 729.86328125, "completions/mean_terminated_length": 729.86328125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.006144917641034395, "frac_reward_zero_std": 0.5625, "grad_norm": 0.258273162274502, "kl": 0.00701904296875, "learning_rate": 1.1945392491467577e-06, "loss": -0.0026, "num_tokens": 8016222.0, "reward": 2.0732421875, "reward_std": 0.16612111032009125, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2749498784542084, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.056234680116176605, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 795.87890625, "completions/mean_terminated_length": 786.0196533203125, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.006315609797729794, "frac_reward_zero_std": 0.625, "grad_norm": 0.1888621247606142, "kl": 0.00594329833984375, "learning_rate": 1.2286689419795223e-06, "loss": 0.0241, "num_tokens": 8261695.0, "reward": 2.115234375, "reward_std": 0.18600620329380035, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.0661611557006836, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1878.0, "completions/mean_length": 820.5078125, "completions/mean_terminated_length": 801.0238647460938, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.006486301954425194, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2232911471213029, "kl": 0.00562286376953125, "learning_rate": 1.2627986348122867e-06, "loss": 0.0299, "num_tokens": 8512993.0, "reward": 2.12890625, "reward_std": 0.20460191369056702, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.3710577189922333, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.08494158834218979, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 681.0078125, "completions/mean_terminated_length": 675.6470947265625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.006656994111120594, "frac_reward_zero_std": 0.5625, "grad_norm": 0.23655045788489482, "kl": 0.009368896484375, "learning_rate": 1.2969283276450511e-06, "loss": 0.0104, "num_tokens": 8728611.0, "reward": 2.1025390625, "reward_std": 0.1747848242521286, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06789661198854446, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1795.0, "completions/mean_length": 742.3203125, "completions/mean_terminated_length": 737.2000732421875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.006827686267815994, "frac_reward_zero_std": 0.625, "grad_norm": 0.2017777507995557, "kl": 0.007049560546875, "learning_rate": 1.331058020477816e-06, "loss": 0.0275, "num_tokens": 8960213.0, "reward": 2.1728515625, "reward_std": 0.20217061042785645, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38467901945114136, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 760.76171875, "completions/mean_terminated_length": 755.7137451171875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.0069983784245113935, "frac_reward_zero_std": 0.3125, "grad_norm": 0.2992956075981898, "kl": 0.0072174072265625, "learning_rate": 1.3651877133105804e-06, "loss": 0.0225, "num_tokens": 9194536.0, "reward": 2.1845703125, "reward_std": 0.25445520877838135, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06789661198854446, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 707.0, "completions/mean_terminated_length": 669.3012084960938, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.007169070581206793, "frac_reward_zero_std": 0.6875, "grad_norm": 0.20760928928509118, "kl": 0.006744384765625, "learning_rate": 1.3993174061433448e-06, "loss": 0.0117, "num_tokens": 9414952.0, "reward": 2.083984375, "reward_std": 0.15776100754737854, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.34422317147254944, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.978515625, "rewards/tag_count_reward/std": 0.12338106334209442, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1852.0, "completions/mean_length": 755.26953125, "completions/mean_terminated_length": 745.8621826171875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.007339762737902193, "frac_reward_zero_std": 0.4375, "grad_norm": 0.26376586009202774, "kl": 0.007476806640625, "learning_rate": 1.4334470989761092e-06, "loss": -0.0031, "num_tokens": 9650029.0, "reward": 2.3525390625, "reward_std": 0.2393074333667755, "rewards/accuracy_reward/mean": 0.36328125, "rewards/accuracy_reward/std": 0.48188701272010803, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03488371521234512, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1980.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 746.78125, "completions/mean_terminated_length": 746.78125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.007510454894597594, "frac_reward_zero_std": 0.4375, "grad_norm": 0.27459278381581015, "kl": 0.006072998046875, "learning_rate": 1.4675767918088738e-06, "loss": 0.0244, "num_tokens": 9888021.0, "reward": 2.220703125, "reward_std": 0.21234670281410217, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.41942715644836426, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2033.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 759.1875, "completions/mean_terminated_length": 759.1875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.0076811470512929934, "frac_reward_zero_std": 0.6875, "grad_norm": 0.182606327000578, "kl": 0.006988525390625, "learning_rate": 1.5017064846416384e-06, "loss": 0.0004, "num_tokens": 10120661.0, "reward": 2.1640625, "reward_std": 0.1424899697303772, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.3710577189922333, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 748.66015625, "completions/mean_terminated_length": 722.7769165039062, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.007851839207988393, "frac_reward_zero_std": 0.375, "grad_norm": 0.3004437979213552, "kl": 0.00704193115234375, "learning_rate": 1.5358361774744028e-06, "loss": 0.0345, "num_tokens": 10351310.0, "reward": 2.21484375, "reward_std": 0.2952902913093567, "rewards/accuracy_reward/mean": 0.25390625, "rewards/accuracy_reward/std": 0.4360972046852112, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.10502100735902786, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 687.7890625, "completions/mean_terminated_length": 671.6600952148438, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.008022531364683793, "frac_reward_zero_std": 0.625, "grad_norm": 0.25241408780975055, "kl": 0.0067596435546875, "learning_rate": 1.5699658703071675e-06, "loss": 0.0375, "num_tokens": 10570648.0, "reward": 2.16796875, "reward_std": 0.18101255595684052, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.07301289588212967, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1487.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 680.8828125, "completions/mean_terminated_length": 680.8828125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.008193223521379193, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2899062321990787, "kl": 0.007354736328125, "learning_rate": 1.6040955631399319e-06, "loss": 0.0017, "num_tokens": 10786938.0, "reward": 2.005859375, "reward_std": 0.09166031330823898, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 791.75, "completions/mean_terminated_length": 781.8582763671875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.008363915678074593, "frac_reward_zero_std": 0.625, "grad_norm": 0.21174465849851096, "kl": 0.005584716796875, "learning_rate": 1.6382252559726963e-06, "loss": 0.0153, "num_tokens": 11032586.0, "reward": 2.17578125, "reward_std": 0.18091818690299988, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 751.19140625, "completions/mean_terminated_length": 746.1058959960938, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.008534607834769992, "frac_reward_zero_std": 0.5, "grad_norm": 0.25350224796340143, "kl": 0.00661468505859375, "learning_rate": 1.6723549488054607e-06, "loss": 0.0032, "num_tokens": 11264667.0, "reward": 2.1708984375, "reward_std": 0.18332768976688385, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05169277638196945, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1277.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 516.98046875, "completions/mean_terminated_length": 516.98046875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.008705299991465392, "frac_reward_zero_std": 0.5, "grad_norm": 0.414405394906272, "kl": 0.009307861328125, "learning_rate": 1.7064846416382255e-06, "loss": -0.0037, "num_tokens": 11439446.0, "reward": 2.2021484375, "reward_std": 0.17574773728847504, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41420844197273254, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 691.26171875, "completions/mean_terminated_length": 685.9412231445312, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.008875992148160792, "frac_reward_zero_std": 0.25, "grad_norm": 0.30294526432395763, "kl": 0.00522613525390625, "learning_rate": 1.74061433447099e-06, "loss": -0.0131, "num_tokens": 11658137.0, "reward": 2.44140625, "reward_std": 0.33538830280303955, "rewards/accuracy_reward/mean": 0.4833333194255829, "rewards/accuracy_reward/std": 0.5007664561271667, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1502.0, "completions/max_terminated_length": 1502.0, "completions/mean_length": 605.38671875, "completions/mean_terminated_length": 605.38671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.009046684304856192, "frac_reward_zero_std": 0.6875, "grad_norm": 0.23888951256066668, "kl": 0.00830841064453125, "learning_rate": 1.7747440273037543e-06, "loss": 0.016, "num_tokens": 11852332.0, "reward": 2.1953125, "reward_std": 0.11761415004730225, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1478.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 619.5546875, "completions/mean_terminated_length": 619.5546875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.009217376461551591, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2594786707177375, "kl": 0.0082855224609375, "learning_rate": 1.808873720136519e-06, "loss": 0.0169, "num_tokens": 12046634.0, "reward": 2.06640625, "reward_std": 0.1465591937303543, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 668.7265625, "completions/mean_terminated_length": 668.7265625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.009388068618246991, "frac_reward_zero_std": 0.5, "grad_norm": 0.29707304852722455, "kl": 0.00847625732421875, "learning_rate": 1.8430034129692834e-06, "loss": 0.0169, "num_tokens": 12260868.0, "reward": 2.125, "reward_std": 0.21018312871456146, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 656.6171875, "completions/mean_terminated_length": 651.1608276367188, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.009558760774942391, "frac_reward_zero_std": 0.4375, "grad_norm": 0.30416061112402926, "kl": 0.00766754150390625, "learning_rate": 1.8771331058020478e-06, "loss": 0.0238, "num_tokens": 12480642.0, "reward": 2.1494140625, "reward_std": 0.21428507566452026, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3734568655490875, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 752.875, "completions/mean_terminated_length": 742.6771850585938, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.00972945293163779, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2504574866986988, "kl": 0.008209228515625, "learning_rate": 1.9112627986348124e-06, "loss": 0.029, "num_tokens": 12715634.0, "reward": 2.126953125, "reward_std": 0.20189255475997925, "rewards/accuracy_reward/mean": 0.16249999403953552, "rewards/accuracy_reward/std": 0.3696798086166382, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.08226180076599121, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 672.6953125, "completions/mean_terminated_length": 672.6953125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.00990014508833319, "frac_reward_zero_std": 0.5, "grad_norm": 0.26157217744306693, "kl": 0.0089111328125, "learning_rate": 1.945392491467577e-06, "loss": 0.013, "num_tokens": 12926484.0, "reward": 2.24609375, "reward_std": 0.1803291141986847, "rewards/accuracy_reward/mean": 0.24609375, "rewards/accuracy_reward/std": 0.43157756328582764, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 780.99609375, "completions/mean_terminated_length": 776.0275268554688, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.01007083724502859, "frac_reward_zero_std": 0.5, "grad_norm": 0.23632907079458523, "kl": 0.00539398193359375, "learning_rate": 1.9795221843003416e-06, "loss": -0.005, "num_tokens": 13167891.0, "reward": 2.3095703125, "reward_std": 0.18590891361236572, "rewards/accuracy_reward/mean": 0.31640625, "rewards/accuracy_reward/std": 0.4659844934940338, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1896.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 606.234375, "completions/mean_terminated_length": 606.234375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.01024152940172399, "frac_reward_zero_std": 0.8125, "grad_norm": 0.17435902520545438, "kl": 0.00850677490234375, "learning_rate": 2.013651877133106e-06, "loss": 0.001, "num_tokens": 13367215.0, "reward": 2.21875, "reward_std": 0.07064647972583771, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41420844197273254, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1881.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 694.99609375, "completions/mean_terminated_length": 694.99609375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.01041222155841939, "frac_reward_zero_std": 0.5625, "grad_norm": 0.254663017557562, "kl": 0.0066680908203125, "learning_rate": 2.0477815699658705e-06, "loss": -0.0003, "num_tokens": 13585326.0, "reward": 2.265625, "reward_std": 0.17044082283973694, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.4425306022167206, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1744.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 642.80078125, "completions/mean_terminated_length": 642.80078125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.01058291371511479, "frac_reward_zero_std": 0.6875, "grad_norm": 0.25829096423372583, "kl": 0.0095977783203125, "learning_rate": 2.0819112627986347e-06, "loss": 0.0044, "num_tokens": 13794107.0, "reward": 2.13671875, "reward_std": 0.14895963668823242, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.34422317147254944, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1393.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 637.96484375, "completions/mean_terminated_length": 637.96484375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.010753605871810191, "frac_reward_zero_std": 0.6875, "grad_norm": 0.21191654842346513, "kl": 0.00829315185546875, "learning_rate": 2.1160409556313997e-06, "loss": 0.0083, "num_tokens": 13998210.0, "reward": 2.1640625, "reward_std": 0.1000204086303711, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.3710577189922333, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 654.02734375, "completions/mean_terminated_length": 654.02734375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.010924298028505591, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3026421139657038, "kl": 0.008575439453125, "learning_rate": 2.150170648464164e-06, "loss": 0.0174, "num_tokens": 14207913.0, "reward": 2.283203125, "reward_std": 0.21743682026863098, "rewards/accuracy_reward/mean": 0.29296875, "rewards/accuracy_reward/std": 0.45601576566696167, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1903.0, "completions/max_terminated_length": 1903.0, "completions/mean_length": 681.14453125, "completions/mean_terminated_length": 681.14453125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.01109499018520099, "frac_reward_zero_std": 0.6875, "grad_norm": 0.23455429415670015, "kl": 0.00821685791015625, "learning_rate": 2.1843003412969285e-06, "loss": -0.0059, "num_tokens": 14428670.0, "reward": 2.19921875, "reward_std": 0.1204879954457283, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2023.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 776.578125, "completions/mean_terminated_length": 776.578125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.01126568234189639, "frac_reward_zero_std": 0.6875, "grad_norm": 0.22202524529474887, "kl": 0.0073699951171875, "learning_rate": 2.218430034129693e-06, "loss": 0.0083, "num_tokens": 14671458.0, "reward": 2.04296875, "reward_std": 0.10574321448802948, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1965.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 646.44140625, "completions/mean_terminated_length": 646.44140625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.01143637449859179, "frac_reward_zero_std": 0.6875, "grad_norm": 0.22101949459548084, "kl": 0.00717926025390625, "learning_rate": 2.2525597269624573e-06, "loss": -0.0027, "num_tokens": 14872083.0, "reward": 2.08203125, "reward_std": 0.11046826094388962, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2749498784542084, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1440.0, "completions/max_terminated_length": 1440.0, "completions/mean_length": 649.015625, "completions/mean_terminated_length": 649.015625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.01160706665528719, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3707400443338804, "kl": 0.01198577880859375, "learning_rate": 2.286689419795222e-06, "loss": 0.0066, "num_tokens": 15081991.0, "reward": 2.25390625, "reward_std": 0.19493989646434784, "rewards/accuracy_reward/mean": 0.25390625, "rewards/accuracy_reward/std": 0.4360972046852112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1845.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 697.3671875, "completions/mean_terminated_length": 697.3671875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.01177775881198259, "frac_reward_zero_std": 0.75, "grad_norm": 0.18393437417739483, "kl": 0.0094146728515625, "learning_rate": 2.3208191126279866e-06, "loss": 0.0057, "num_tokens": 15304405.0, "reward": 2.2578125, "reward_std": 0.11664125323295593, "rewards/accuracy_reward/mean": 0.2578125, "rewards/accuracy_reward/std": 0.4382871091365814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 765.0625, "completions/mean_terminated_length": 728.9959716796875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.01194845096867799, "frac_reward_zero_std": 0.5, "grad_norm": 0.24309471242601235, "kl": 0.00585174560546875, "learning_rate": 2.354948805460751e-06, "loss": 0.0122, "num_tokens": 15544021.0, "reward": 2.173828125, "reward_std": 0.2439385950565338, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41420844197273254, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.982421875, "rewards/tag_count_reward/std": 0.10929181426763535, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1980.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 632.97265625, "completions/mean_terminated_length": 632.97265625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.01211914312537339, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2245584357907696, "kl": 0.0071868896484375, "learning_rate": 2.3890784982935154e-06, "loss": -0.0043, "num_tokens": 15748878.0, "reward": 2.19140625, "reward_std": 0.11479227244853973, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 643.125, "completions/mean_terminated_length": 643.125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.01228983528206879, "frac_reward_zero_std": 0.75, "grad_norm": 0.2456382557671944, "kl": 0.00814056396484375, "learning_rate": 2.42320819112628e-06, "loss": -0.0115, "num_tokens": 15960030.0, "reward": 2.11328125, "reward_std": 0.09947754442691803, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 710.40625, "completions/mean_terminated_length": 710.40625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.012460527438764189, "frac_reward_zero_std": 0.6875, "grad_norm": 0.21495158843292694, "kl": 0.006927490234375, "learning_rate": 2.4573378839590446e-06, "loss": -0.021, "num_tokens": 16182902.0, "reward": 2.08203125, "reward_std": 0.12349742650985718, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2749498784542084, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 764.28515625, "completions/mean_terminated_length": 764.28515625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.012631219595459589, "frac_reward_zero_std": 0.4375, "grad_norm": 0.27543312562215216, "kl": 0.0063934326171875, "learning_rate": 2.491467576791809e-06, "loss": 0.0098, "num_tokens": 16418127.0, "reward": 2.1669921875, "reward_std": 0.2387092411518097, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 663.86328125, "completions/mean_terminated_length": 658.435302734375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.012801911752154988, "frac_reward_zero_std": 0.8125, "grad_norm": 0.15548927898474868, "kl": 0.0077972412109375, "learning_rate": 2.5255972696245735e-06, "loss": 0.0207, "num_tokens": 16634860.0, "reward": 2.0869140625, "reward_std": 0.07994156330823898, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2043.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 680.0078125, "completions/mean_terminated_length": 680.0078125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.012972603908850388, "frac_reward_zero_std": 0.6875, "grad_norm": 0.17683256491943397, "kl": 0.010345458984375, "learning_rate": 2.559726962457338e-06, "loss": 0.0172, "num_tokens": 16848510.0, "reward": 2.146484375, "reward_std": 0.11432743072509766, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.3562295734882355, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1729.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 712.15625, "completions/mean_terminated_length": 712.15625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.013143296065545788, "frac_reward_zero_std": 0.625, "grad_norm": 0.26419435778913625, "kl": 0.008819580078125, "learning_rate": 2.5938566552901023e-06, "loss": -0.0112, "num_tokens": 17072982.0, "reward": 2.25390625, "reward_std": 0.17240957915782928, "rewards/accuracy_reward/mean": 0.25390625, "rewards/accuracy_reward/std": 0.4360972046852112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 716.83984375, "completions/mean_terminated_length": 711.61962890625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.013313988222241188, "frac_reward_zero_std": 0.6875, "grad_norm": 0.23950902735079127, "kl": 0.0089263916015625, "learning_rate": 2.6279863481228673e-06, "loss": 0.0139, "num_tokens": 17297869.0, "reward": 2.1318359375, "reward_std": 0.10722580552101135, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.34422317147254944, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1590.0, "completions/max_terminated_length": 1590.0, "completions/mean_length": 793.07421875, "completions/mean_terminated_length": 793.07421875, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.013484680378936588, "frac_reward_zero_std": 0.8125, "grad_norm": 0.14716176669226788, "kl": 0.006866455078125, "learning_rate": 2.662116040955632e-06, "loss": -0.0038, "num_tokens": 17544272.0, "reward": 2.0625, "reward_std": 0.07646197080612183, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 671.34375, "completions/mean_terminated_length": 671.34375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.013655372535631987, "frac_reward_zero_std": 0.5625, "grad_norm": 0.34217376518725384, "kl": 0.0101165771484375, "learning_rate": 2.696245733788396e-06, "loss": -0.0338, "num_tokens": 17758984.0, "reward": 2.1328125, "reward_std": 0.1737399697303772, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 691.3359375, "completions/mean_terminated_length": 686.0157470703125, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.013826064692327387, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2523881761885002, "kl": 0.008392333984375, "learning_rate": 2.7303754266211608e-06, "loss": 0.0263, "num_tokens": 17976446.0, "reward": 2.0908203125, "reward_std": 0.12283584475517273, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29743078351020813, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 706.82421875, "completions/mean_terminated_length": 701.5647583007812, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.013996756849022787, "frac_reward_zero_std": 0.5, "grad_norm": 0.232731893875513, "kl": 0.0074462890625, "learning_rate": 2.7645051194539254e-06, "loss": 0.025, "num_tokens": 18193409.0, "reward": 2.0947265625, "reward_std": 0.17302629351615906, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1369.0, "completions/max_terminated_length": 1369.0, "completions/mean_length": 583.02734375, "completions/mean_terminated_length": 583.02734375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.014167449005718187, "frac_reward_zero_std": 0.3125, "grad_norm": 0.4391655087586553, "kl": 0.012115478515625, "learning_rate": 2.7986348122866896e-06, "loss": 0.0048, "num_tokens": 18390488.0, "reward": 2.3173828125, "reward_std": 0.3140406012535095, "rewards/accuracy_reward/mean": 0.32421875, "rewards/accuracy_reward/std": 0.46899911761283875, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 688.9765625, "completions/mean_terminated_length": 688.9765625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.014338141162413586, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3117003300245406, "kl": 0.009429931640625, "learning_rate": 2.832764505119454e-06, "loss": -0.0084, "num_tokens": 18610034.0, "reward": 2.18359375, "reward_std": 0.250643253326416, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.387910932302475, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 793.73828125, "completions/mean_terminated_length": 783.8621826171875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.014508833319108986, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2381725987219416, "kl": 0.00677490234375, "learning_rate": 2.8668941979522184e-06, "loss": 0.0211, "num_tokens": 18850607.0, "reward": 2.1767578125, "reward_std": 0.16373750567436218, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06789661198854446, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1876.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 820.4453125, "completions/mean_terminated_length": 820.4453125, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.014679525475804386, "frac_reward_zero_std": 0.8125, "grad_norm": 0.14951680890224767, "kl": 0.0074920654296875, "learning_rate": 2.901023890784983e-06, "loss": -0.0067, "num_tokens": 19102497.0, "reward": 2.078125, "reward_std": 0.08636415004730225, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 740.14453125, "completions/mean_terminated_length": 735.0157470703125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.014850217632499786, "frac_reward_zero_std": 0.5, "grad_norm": 0.2670441905290679, "kl": 0.0089263916015625, "learning_rate": 2.9351535836177476e-06, "loss": 0.0197, "num_tokens": 19330102.0, "reward": 2.1767578125, "reward_std": 0.20922625064849854, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.387910932302475, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1703.0, "completions/mean_length": 687.23046875, "completions/mean_terminated_length": 681.8941650390625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.015020909789195187, "frac_reward_zero_std": 0.375, "grad_norm": 0.25629215202067057, "kl": 0.01251220703125, "learning_rate": 2.969283276450512e-06, "loss": 0.0331, "num_tokens": 19543745.0, "reward": 2.4072265625, "reward_std": 0.2777283191680908, "rewards/accuracy_reward/mean": 0.4140625, "rewards/accuracy_reward/std": 0.4935242533683777, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 647.33984375, "completions/mean_terminated_length": 647.33984375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.015191601945890587, "frac_reward_zero_std": 0.8125, "grad_norm": 0.19936329312197484, "kl": 0.0143280029296875, "learning_rate": 3.003412969283277e-06, "loss": -0.0007, "num_tokens": 19759368.0, "reward": 2.1279296875, "reward_std": 0.07619576156139374, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 746.09375, "completions/mean_terminated_length": 746.09375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.015362294102585987, "frac_reward_zero_std": 0.625, "grad_norm": 0.25238998602641927, "kl": 0.0118255615234375, "learning_rate": 3.0375426621160415e-06, "loss": -0.0037, "num_tokens": 19987856.0, "reward": 2.1328125, "reward_std": 0.09375, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1878.0, "completions/mean_length": 682.64453125, "completions/mean_terminated_length": 677.2902221679688, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.015532986259281387, "frac_reward_zero_std": 0.5625, "grad_norm": 0.27951796310115196, "kl": 0.0142974853515625, "learning_rate": 3.0716723549488057e-06, "loss": 0.0282, "num_tokens": 20204277.0, "reward": 2.228515625, "reward_std": 0.2048923522233963, "rewards/accuracy_reward/mean": 0.23828125, "rewards/accuracy_reward/std": 0.4268665909767151, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.0661611557006836, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2021.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 687.17578125, "completions/mean_terminated_length": 687.17578125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.015703678415976786, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2585575337390732, "kl": 0.022003173828125, "learning_rate": 3.1058020477815703e-06, "loss": 0.0155, "num_tokens": 20418610.0, "reward": 2.20703125, "reward_std": 0.11838586628437042, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1810.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 737.375, "completions/mean_terminated_length": 737.375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.015874370572672186, "frac_reward_zero_std": 0.75, "grad_norm": 0.18489582050937683, "kl": 0.0125885009765625, "learning_rate": 3.139931740614335e-06, "loss": 0.0034, "num_tokens": 20647186.0, "reward": 2.078125, "reward_std": 0.11282351613044739, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1847.0, "completions/max_terminated_length": 1847.0, "completions/mean_length": 741.81640625, "completions/mean_terminated_length": 741.81640625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.016045062729367586, "frac_reward_zero_std": 0.625, "grad_norm": 0.26843169730452926, "kl": 0.017333984375, "learning_rate": 3.174061433447099e-06, "loss": -0.0007, "num_tokens": 20878739.0, "reward": 2.1904296875, "reward_std": 0.14585061371326447, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 791.75390625, "completions/mean_terminated_length": 786.8275146484375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.016215754886062986, "frac_reward_zero_std": 0.75, "grad_norm": 0.17254414867684537, "kl": 0.0138702392578125, "learning_rate": 3.2081911262798638e-06, "loss": 0.0325, "num_tokens": 21123028.0, "reward": 2.0517578125, "reward_std": 0.1086641326546669, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 800.51171875, "completions/mean_terminated_length": 790.68896484375, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.016386447042758386, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2605610556966682, "kl": 0.0174407958984375, "learning_rate": 3.2423208191126284e-06, "loss": -0.0057, "num_tokens": 21372695.0, "reward": 2.0908203125, "reward_std": 0.15713445842266083, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31272050738334656, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06789661198854446, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 807.7421875, "completions/mean_terminated_length": 793.0355834960938, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.016557139199453785, "frac_reward_zero_std": 0.8125, "grad_norm": 0.1577528747965402, "kl": 0.0130462646484375, "learning_rate": 3.2764505119453926e-06, "loss": 0.0185, "num_tokens": 21619765.0, "reward": 2.1728515625, "reward_std": 0.08444623649120331, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.06789661198854446, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 805.2265625, "completions/mean_terminated_length": 800.3529663085938, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.016727831356149185, "frac_reward_zero_std": 0.75, "grad_norm": 0.15275441436849826, "kl": 0.0128021240234375, "learning_rate": 3.310580204778157e-06, "loss": 0.0121, "num_tokens": 21864639.0, "reward": 2.064453125, "reward_std": 0.09670154750347137, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.0661611557006836, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1629.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 702.01953125, "completions/mean_terminated_length": 702.01953125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.016898523512844585, "frac_reward_zero_std": 0.875, "grad_norm": 0.18083599052173824, "kl": 0.018402099609375, "learning_rate": 3.3447098976109214e-06, "loss": -0.0012, "num_tokens": 22082228.0, "reward": 2.1162109375, "reward_std": 0.0607106052339077, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 723.54296875, "completions/mean_terminated_length": 713.1141967773438, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.017069215669539985, "frac_reward_zero_std": 0.4375, "grad_norm": 0.24884278430409099, "kl": 0.0155029296875, "learning_rate": 3.378839590443686e-06, "loss": 0.026, "num_tokens": 22303279.0, "reward": 2.2392578125, "reward_std": 0.23562178015708923, "rewards/accuracy_reward/mean": 0.25390625, "rewards/accuracy_reward/std": 0.4360972046852112, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 692.34765625, "completions/mean_terminated_length": 692.34765625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.017239907826235384, "frac_reward_zero_std": 0.625, "grad_norm": 0.20248916577203832, "kl": 0.0167236328125, "learning_rate": 3.412969283276451e-06, "loss": -0.0018, "num_tokens": 22523336.0, "reward": 2.24609375, "reward_std": 0.15691795945167542, "rewards/accuracy_reward/mean": 0.24609375, "rewards/accuracy_reward/std": 0.43157756328582764, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1734.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 756.94921875, "completions/mean_terminated_length": 756.94921875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.017410599982930784, "frac_reward_zero_std": 0.8125, "grad_norm": 0.13398770139223934, "kl": 0.0148773193359375, "learning_rate": 3.4470989761092157e-06, "loss": -0.0002, "num_tokens": 22759163.0, "reward": 2.078125, "reward_std": 0.062167368829250336, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 708.84375, "completions/mean_terminated_length": 703.5922241210938, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.017581292139626184, "frac_reward_zero_std": 0.75, "grad_norm": 0.24473145623008263, "kl": 0.0200958251953125, "learning_rate": 3.48122866894198e-06, "loss": 0.0108, "num_tokens": 22982915.0, "reward": 2.0458984375, "reward_std": 0.10204866528511047, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 786.7421875, "completions/mean_terminated_length": 786.7421875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.017751984296321584, "frac_reward_zero_std": 0.625, "grad_norm": 0.24070076801131474, "kl": 0.011688232421875, "learning_rate": 3.5153583617747445e-06, "loss": 0.0038, "num_tokens": 23225985.0, "reward": 2.1025390625, "reward_std": 0.11933804303407669, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3077581524848938, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2030.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 819.24609375, "completions/mean_terminated_length": 819.24609375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.017922676453016984, "frac_reward_zero_std": 0.625, "grad_norm": 0.1545769997964289, "kl": 0.0134429931640625, "learning_rate": 3.5494880546075087e-06, "loss": 0.0022, "num_tokens": 23477504.0, "reward": 2.15625, "reward_std": 0.15303051471710205, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 605.16015625, "completions/mean_terminated_length": 599.5020141601562, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.018093368609712383, "frac_reward_zero_std": 0.625, "grad_norm": 0.24626583528049076, "kl": 0.0157623291015625, "learning_rate": 3.5836177474402733e-06, "loss": 0.0211, "num_tokens": 23670425.0, "reward": 2.1845703125, "reward_std": 0.1715986132621765, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1747.0, "completions/max_terminated_length": 1747.0, "completions/mean_length": 695.2109375, "completions/mean_terminated_length": 695.2109375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.018264060766407783, "frac_reward_zero_std": 0.4375, "grad_norm": 0.2886175290601021, "kl": 0.014892578125, "learning_rate": 3.617747440273038e-06, "loss": -0.0001, "num_tokens": 23891087.0, "reward": 2.25390625, "reward_std": 0.23292501270771027, "rewards/accuracy_reward/mean": 0.25390625, "rewards/accuracy_reward/std": 0.4360972046852112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 557.80859375, "completions/mean_terminated_length": 551.9647216796875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.018434752923103183, "frac_reward_zero_std": 0.75, "grad_norm": 0.21873212866166208, "kl": 0.0167236328125, "learning_rate": 3.651877133105802e-06, "loss": 0.0202, "num_tokens": 24079838.0, "reward": 2.0595703125, "reward_std": 0.11606316268444061, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 767.73828125, "completions/mean_terminated_length": 757.657470703125, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.018605445079798583, "frac_reward_zero_std": 0.375, "grad_norm": 0.2487816396792799, "kl": 0.011749267578125, "learning_rate": 3.6860068259385667e-06, "loss": 0.0154, "num_tokens": 24315355.0, "reward": 2.162109375, "reward_std": 0.2882852554321289, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.986328125, "rewards/tag_count_reward/std": 0.0929294154047966, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1428.0, "completions/max_terminated_length": 1428.0, "completions/mean_length": 792.3125, "completions/mean_terminated_length": 792.3125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.018776137236493982, "frac_reward_zero_std": 0.5625, "grad_norm": 0.20037786666033028, "kl": 0.010955810546875, "learning_rate": 3.7201365187713314e-06, "loss": -0.002, "num_tokens": 24557707.0, "reward": 2.1630859375, "reward_std": 0.18660128116607666, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.3745708465576172, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 646.296875, "completions/mean_terminated_length": 646.296875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.018946829393189382, "frac_reward_zero_std": 0.5, "grad_norm": 0.23514842058028607, "kl": 0.0132598876953125, "learning_rate": 3.7542662116040956e-06, "loss": -0.0048, "num_tokens": 24763063.0, "reward": 2.1484375, "reward_std": 0.18188363313674927, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.3562295734882355, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1296.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 617.875, "completions/mean_terminated_length": 617.875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.019117521549884782, "frac_reward_zero_std": 0.625, "grad_norm": 0.2060489108390196, "kl": 0.0142822265625, "learning_rate": 3.78839590443686e-06, "loss": 0.0047, "num_tokens": 24964167.0, "reward": 2.12890625, "reward_std": 0.1204879954457283, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1866.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 754.9609375, "completions/mean_terminated_length": 754.9609375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.019288213706580182, "frac_reward_zero_std": 0.8125, "grad_norm": 0.13769842564671356, "kl": 0.01336669921875, "learning_rate": 3.822525597269625e-06, "loss": -0.0031, "num_tokens": 25200093.0, "reward": 2.125, "reward_std": 0.0873890146613121, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1915.0, "completions/max_terminated_length": 1915.0, "completions/mean_length": 779.2890625, "completions/mean_terminated_length": 779.2890625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.01945890586327558, "frac_reward_zero_std": 0.5625, "grad_norm": 0.27220059984401745, "kl": 0.015625, "learning_rate": 3.8566552901023894e-06, "loss": 0.0091, "num_tokens": 25440487.0, "reward": 2.142578125, "reward_std": 0.19816526770591736, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.3710577189922333, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.0539139099419117, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1982.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 706.98828125, "completions/mean_terminated_length": 706.98828125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.01962959801997098, "frac_reward_zero_std": 0.4375, "grad_norm": 0.31723573648716746, "kl": 0.0148773193359375, "learning_rate": 3.890784982935154e-06, "loss": 0.0068, "num_tokens": 25674388.0, "reward": 2.189453125, "reward_std": 0.22599273920059204, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1948.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 777.6875, "completions/mean_terminated_length": 777.6875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.01980029017666638, "frac_reward_zero_std": 0.75, "grad_norm": 0.1701501498608006, "kl": 0.0129241943359375, "learning_rate": 3.924914675767919e-06, "loss": 0.004, "num_tokens": 25918532.0, "reward": 2.0927734375, "reward_std": 0.12216898798942566, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29743078351020813, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 739.59765625, "completions/mean_terminated_length": 739.59765625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.01997098233336178, "frac_reward_zero_std": 0.5, "grad_norm": 0.5459787371211059, "kl": 0.023834228515625, "learning_rate": 3.959044368600683e-06, "loss": 0.0094, "num_tokens": 26145005.0, "reward": 2.212890625, "reward_std": 0.20464491844177246, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41420844197273254, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1455.0, "completions/max_terminated_length": 1455.0, "completions/mean_length": 700.9375, "completions/mean_terminated_length": 700.9375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.02014167449005718, "frac_reward_zero_std": 0.5, "grad_norm": 0.25087476038095513, "kl": 0.01702880859375, "learning_rate": 3.993174061433447e-06, "loss": 0.0184, "num_tokens": 26361373.0, "reward": 2.32421875, "reward_std": 0.22974562644958496, "rewards/accuracy_reward/mean": 0.32421875, "rewards/accuracy_reward/std": 0.46899911761283875, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1389.0, "completions/max_terminated_length": 1389.0, "completions/mean_length": 629.30078125, "completions/mean_terminated_length": 629.30078125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.02031236664675258, "frac_reward_zero_std": 0.5625, "grad_norm": 0.30389721634677247, "kl": 0.01806640625, "learning_rate": 4.027303754266212e-06, "loss": -0.0015, "num_tokens": 26565482.0, "reward": 2.171875, "reward_std": 0.17476484179496765, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.387910932302475, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.044107433408498764, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1891.0, "completions/max_terminated_length": 1891.0, "completions/mean_length": 552.6953125, "completions/mean_terminated_length": 552.6953125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.02048305880344798, "frac_reward_zero_std": 0.625, "grad_norm": 0.28275313355932785, "kl": 0.0301513671875, "learning_rate": 4.061433447098976e-06, "loss": -0.0132, "num_tokens": 26753020.0, "reward": 2.046875, "reward_std": 0.1443556547164917, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03814799711108208, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1261.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 601.67578125, "completions/mean_terminated_length": 601.67578125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.02065375096014338, "frac_reward_zero_std": 0.375, "grad_norm": 0.3734992929629062, "kl": 0.02874755859375, "learning_rate": 4.095563139931741e-06, "loss": -0.0167, "num_tokens": 26948233.0, "reward": 2.1005859375, "reward_std": 0.20315931737422943, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.05575593560934067, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 660.0703125, "completions/mean_terminated_length": 660.0703125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.02082444311683878, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2903466474706598, "kl": 0.02630615234375, "learning_rate": 4.1296928327645055e-06, "loss": 0.0048, "num_tokens": 27155787.0, "reward": 2.208984375, "reward_std": 0.18900346755981445, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1260.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 533.91796875, "completions/mean_terminated_length": 533.91796875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.02099513527353418, "frac_reward_zero_std": 0.375, "grad_norm": 0.3982589079367093, "kl": 0.03631591796875, "learning_rate": 4.163822525597269e-06, "loss": 0.0242, "num_tokens": 27332806.0, "reward": 2.2158203125, "reward_std": 0.215314120054245, "rewards/accuracy_reward/mean": 0.22265625, "rewards/accuracy_reward/std": 0.41684433817863464, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1071.0, "completions/max_terminated_length": 1071.0, "completions/mean_length": 512.796875, "completions/mean_terminated_length": 512.796875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.02116582743022958, "frac_reward_zero_std": 0.625, "grad_norm": 0.2505293866288354, "kl": 0.03009033203125, "learning_rate": 4.197952218430034e-06, "loss": -0.0084, "num_tokens": 27503298.0, "reward": 2.20703125, "reward_std": 0.13511523604393005, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 558.65234375, "completions/mean_terminated_length": 552.811767578125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.02133651958692498, "frac_reward_zero_std": 0.3125, "grad_norm": 0.4041540658582076, "kl": 0.0357666015625, "learning_rate": 4.232081911262799e-06, "loss": 0.0347, "num_tokens": 27689625.0, "reward": 2.19140625, "reward_std": 0.2629798948764801, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 644.40234375, "completions/mean_terminated_length": 644.40234375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.021507211743620382, "frac_reward_zero_std": 0.625, "grad_norm": 0.21416701022589021, "kl": 0.02935791015625, "learning_rate": 4.266211604095564e-06, "loss": 0.0031, "num_tokens": 27899968.0, "reward": 2.103515625, "reward_std": 0.16052621603012085, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31272050738334656, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 602.359375, "completions/mean_terminated_length": 602.359375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.021677903900315782, "frac_reward_zero_std": 0.625, "grad_norm": 0.2744837202085132, "kl": 0.029083251953125, "learning_rate": 4.300341296928328e-06, "loss": 0.0127, "num_tokens": 28099804.0, "reward": 2.2138671875, "reward_std": 0.1719234138727188, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41420844197273254, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 651.40625, "completions/mean_terminated_length": 651.40625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.021848596057011182, "frac_reward_zero_std": 0.4375, "grad_norm": 0.27584397981865516, "kl": 0.030059814453125, "learning_rate": 4.3344709897610924e-06, "loss": 0.0172, "num_tokens": 28305492.0, "reward": 2.361328125, "reward_std": 0.21823662519454956, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.4850712716579437, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.0661611557006836, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1232.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 610.81640625, "completions/mean_terminated_length": 610.81640625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.02201928821370658, "frac_reward_zero_std": 0.6875, "grad_norm": 0.24631638313204368, "kl": 0.0279541015625, "learning_rate": 4.368600682593857e-06, "loss": 0.0122, "num_tokens": 28503589.0, "reward": 2.2685546875, "reward_std": 0.1256745457649231, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.45048993825912476, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08950243145227432, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1887.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 739.35546875, "completions/mean_terminated_length": 739.35546875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.02218998037040198, "frac_reward_zero_std": 0.6875, "grad_norm": 0.17694265620869395, "kl": 0.025177001953125, "learning_rate": 4.402730375426622e-06, "loss": 0.0006, "num_tokens": 28736896.0, "reward": 2.2783203125, "reward_std": 0.10216960310935974, "rewards/accuracy_reward/mean": 0.29296875, "rewards/accuracy_reward/std": 0.45601576566696167, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 758.546875, "completions/mean_terminated_length": 753.490234375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.02236067252709738, "frac_reward_zero_std": 0.6875, "grad_norm": 0.17784007566596025, "kl": 0.023468017578125, "learning_rate": 4.436860068259386e-06, "loss": 0.0086, "num_tokens": 28974476.0, "reward": 2.1689453125, "reward_std": 0.13223382830619812, "rewards/accuracy_reward/mean": 0.17578125, "rewards/accuracy_reward/std": 0.3813795745372772, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1729.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 755.82421875, "completions/mean_terminated_length": 755.82421875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.02253136468379278, "frac_reward_zero_std": 0.9375, "grad_norm": 0.08420979151343362, "kl": 0.022674560546875, "learning_rate": 4.47098976109215e-06, "loss": -0.0007, "num_tokens": 29207551.0, "reward": 2.0546875, "reward_std": 0.021347813308238983, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22781464457511902, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 785.9921875, "completions/mean_terminated_length": 781.043212890625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.02270205684048818, "frac_reward_zero_std": 0.5, "grad_norm": 0.21116478031537636, "kl": 0.02520751953125, "learning_rate": 4.505119453924915e-06, "loss": 0.0157, "num_tokens": 29444333.0, "reward": 2.1845703125, "reward_std": 0.20197522640228271, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1619.0, "completions/max_terminated_length": 1619.0, "completions/mean_length": 800.15234375, "completions/mean_terminated_length": 800.15234375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.02287274899718358, "frac_reward_zero_std": 0.5, "grad_norm": 0.26020257483520637, "kl": 0.0235595703125, "learning_rate": 4.539249146757679e-06, "loss": 0.0096, "num_tokens": 29687700.0, "reward": 2.2275390625, "reward_std": 0.18986794352531433, "rewards/accuracy_reward/mean": 0.2421875, "rewards/accuracy_reward/std": 0.4292463958263397, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 697.51171875, "completions/mean_terminated_length": 697.51171875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.02304344115387898, "frac_reward_zero_std": 0.5, "grad_norm": 0.2842143862666158, "kl": 0.027191162109375, "learning_rate": 4.573378839590444e-06, "loss": 0.0178, "num_tokens": 29904823.0, "reward": 2.1826171875, "reward_std": 0.19908671081066132, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 758.72265625, "completions/mean_terminated_length": 753.6666870117188, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.02321413331057438, "frac_reward_zero_std": 0.5, "grad_norm": 0.2536315934864252, "kl": 0.028472900390625, "learning_rate": 4.6075085324232085e-06, "loss": 0.0089, "num_tokens": 30143424.0, "reward": 2.2978515625, "reward_std": 0.1768869310617447, "rewards/accuracy_reward/mean": 0.3046875, "rewards/accuracy_reward/std": 0.4611765742301941, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1989.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 792.30078125, "completions/mean_terminated_length": 792.30078125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.02338482546726978, "frac_reward_zero_std": 0.6875, "grad_norm": 0.24378645817400807, "kl": 0.029754638671875, "learning_rate": 4.641638225255973e-06, "loss": -0.0061, "num_tokens": 30387469.0, "reward": 2.0126953125, "reward_std": 0.11417396366596222, "rewards/accuracy_reward/mean": 0.02916666679084301, "rewards/accuracy_reward/std": 0.16862517595291138, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1878.0, "completions/mean_length": 794.2109375, "completions/mean_terminated_length": 774.3095703125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.02355551762396518, "frac_reward_zero_std": 0.625, "grad_norm": 0.2168470699583023, "kl": 0.026763916015625, "learning_rate": 4.675767918088738e-06, "loss": 0.047, "num_tokens": 30639619.0, "reward": 2.1005859375, "reward_std": 0.18778201937675476, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3483152687549591, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.08623361587524414, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1767.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 724.53515625, "completions/mean_terminated_length": 724.53515625, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.02372620978066058, "frac_reward_zero_std": 0.625, "grad_norm": 0.33197074447984803, "kl": 0.02777099609375, "learning_rate": 4.709897610921502e-06, "loss": 0.006, "num_tokens": 30865372.0, "reward": 2.1630859375, "reward_std": 0.12339647114276886, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.3745708465576172, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 802.32421875, "completions/mean_terminated_length": 797.4392700195312, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.02389690193735598, "frac_reward_zero_std": 0.5, "grad_norm": 0.214781018971746, "kl": 0.02520751953125, "learning_rate": 4.744027303754267e-06, "loss": 0.0042, "num_tokens": 31114767.0, "reward": 2.2333984375, "reward_std": 0.20668752491474152, "rewards/accuracy_reward/mean": 0.23828125, "rewards/accuracy_reward/std": 0.4268665909767151, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 711.69921875, "completions/mean_terminated_length": 711.69921875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.02406759409405138, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3492578237345356, "kl": 0.0333251953125, "learning_rate": 4.778156996587031e-06, "loss": 0.0263, "num_tokens": 31337442.0, "reward": 2.1279296875, "reward_std": 0.1717112511396408, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3483152687549591, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05169277638196945, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1697.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 714.80859375, "completions/mean_terminated_length": 714.80859375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.02423828625074678, "frac_reward_zero_std": 0.8125, "grad_norm": 0.13871579725440533, "kl": 0.03265380859375, "learning_rate": 4.812286689419795e-06, "loss": 0.0039, "num_tokens": 31561713.0, "reward": 2.09375, "reward_std": 0.07856409251689911, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1447.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 673.21484375, "completions/mean_terminated_length": 673.21484375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.02440897840744218, "frac_reward_zero_std": 0.625, "grad_norm": 0.24275981846354905, "kl": 0.03509521484375, "learning_rate": 4.84641638225256e-06, "loss": 0.0157, "num_tokens": 31773528.0, "reward": 2.14453125, "reward_std": 0.12333697080612183, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 690.484375, "completions/mean_terminated_length": 690.484375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.02457967056413758, "frac_reward_zero_std": 0.6875, "grad_norm": 0.21804503903034378, "kl": 0.031097412109375, "learning_rate": 4.880546075085325e-06, "loss": 0.0042, "num_tokens": 32002580.0, "reward": 2.23046875, "reward_std": 0.14736157655715942, "rewards/accuracy_reward/mean": 0.23046875, "rewards/accuracy_reward/std": 0.4219578504562378, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 655.34765625, "completions/mean_terminated_length": 655.34765625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.024750362720832978, "frac_reward_zero_std": 0.6875, "grad_norm": 0.20302447834249934, "kl": 0.030426025390625, "learning_rate": 4.914675767918089e-06, "loss": -0.001, "num_tokens": 32222781.0, "reward": 2.1171875, "reward_std": 0.1281953752040863, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1495.0, "completions/max_terminated_length": 1495.0, "completions/mean_length": 649.5078125, "completions/mean_terminated_length": 649.5078125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.024921054877528378, "frac_reward_zero_std": 0.75, "grad_norm": 0.21406180578154152, "kl": 0.03509521484375, "learning_rate": 4.948805460750853e-06, "loss": 0.0101, "num_tokens": 32428607.0, "reward": 2.15625, "reward_std": 0.11123998463153839, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 614.32421875, "completions/mean_terminated_length": 608.7020263671875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.025091747034223778, "frac_reward_zero_std": 0.5625, "grad_norm": 0.28695163280259384, "kl": 0.03289794921875, "learning_rate": 4.982935153583618e-06, "loss": 0.0322, "num_tokens": 32626978.0, "reward": 2.1943359375, "reward_std": 0.1991184800863266, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1310.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 610.9609375, "completions/mean_terminated_length": 610.9609375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.025262439190919177, "frac_reward_zero_std": 0.625, "grad_norm": 0.20635939969010134, "kl": 0.03350830078125, "learning_rate": 5.017064846416383e-06, "loss": 0.0085, "num_tokens": 32821784.0, "reward": 2.19921875, "reward_std": 0.15151193737983704, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1402.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 670.55859375, "completions/mean_terminated_length": 670.55859375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.025433131347614577, "frac_reward_zero_std": 0.75, "grad_norm": 0.17625390025767632, "kl": 0.0283203125, "learning_rate": 5.051194539249147e-06, "loss": -0.0118, "num_tokens": 33031815.0, "reward": 2.08203125, "reward_std": 0.09452171623706818, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2749498784542084, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1549.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 651.02734375, "completions/mean_terminated_length": 651.02734375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.025603823504309977, "frac_reward_zero_std": 0.5625, "grad_norm": 0.24346912054746958, "kl": 0.0340576171875, "learning_rate": 5.0853242320819115e-06, "loss": -0.0026, "num_tokens": 33245742.0, "reward": 2.1201171875, "reward_std": 0.16588380932807922, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 633.59375, "completions/mean_terminated_length": 633.59375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.025774515661005377, "frac_reward_zero_std": 0.875, "grad_norm": 0.13056479340573707, "kl": 0.031890869140625, "learning_rate": 5.119453924914676e-06, "loss": 0.0043, "num_tokens": 33447942.0, "reward": 2.05859375, "reward_std": 0.06429658085107803, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 646.15234375, "completions/mean_terminated_length": 646.15234375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.025945207817700777, "frac_reward_zero_std": 0.4375, "grad_norm": 0.2646841740569757, "kl": 0.028656005859375, "learning_rate": 5.153583617747441e-06, "loss": -0.0026, "num_tokens": 33648669.0, "reward": 2.32421875, "reward_std": 0.22917568683624268, "rewards/accuracy_reward/mean": 0.32421875, "rewards/accuracy_reward/std": 0.46899911761283875, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1850.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 725.18359375, "completions/mean_terminated_length": 725.18359375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.026115899974396176, "frac_reward_zero_std": 0.625, "grad_norm": 0.2525861032475754, "kl": 0.032989501953125, "learning_rate": 5.1877133105802046e-06, "loss": 0.007, "num_tokens": 33876348.0, "reward": 2.19140625, "reward_std": 0.1487714797258377, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 693.296875, "completions/mean_terminated_length": 693.296875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.026286592131091576, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2284276794346169, "kl": 0.033416748046875, "learning_rate": 5.22184300341297e-06, "loss": 0.0059, "num_tokens": 34094936.0, "reward": 2.1083984375, "reward_std": 0.12065605819225311, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 629.20703125, "completions/mean_terminated_length": 629.20703125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.026457284287786976, "frac_reward_zero_std": 0.8125, "grad_norm": 0.17286938701224458, "kl": 0.0274658203125, "learning_rate": 5.255972696245735e-06, "loss": 0.0035, "num_tokens": 34309693.0, "reward": 2.1640625, "reward_std": 0.07206955552101135, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3911280930042267, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 713.16796875, "completions/mean_terminated_length": 702.657470703125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.026627976444482376, "frac_reward_zero_std": 0.4375, "grad_norm": 0.29527317667724423, "kl": 0.03326416015625, "learning_rate": 5.290102389078498e-06, "loss": 0.008, "num_tokens": 34534952.0, "reward": 2.2412109375, "reward_std": 0.2634333372116089, "rewards/accuracy_reward/mean": 0.26171875, "rewards/accuracy_reward/std": 0.4404313564300537, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.0808708667755127, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1566.0, "completions/max_terminated_length": 1566.0, "completions/mean_length": 727.08984375, "completions/mean_terminated_length": 727.08984375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.026798668601177775, "frac_reward_zero_std": 0.75, "grad_norm": 0.2065668708151846, "kl": 0.029083251953125, "learning_rate": 5.324232081911264e-06, "loss": 0.0259, "num_tokens": 34766799.0, "reward": 2.1015625, "reward_std": 0.11641712486743927, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2033.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 716.7734375, "completions/mean_terminated_length": 716.7734375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.026969360757873175, "frac_reward_zero_std": 0.5625, "grad_norm": 0.22101564845855787, "kl": 0.031036376953125, "learning_rate": 5.358361774744028e-06, "loss": 0.0096, "num_tokens": 34990837.0, "reward": 2.1953125, "reward_std": 0.15911275148391724, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1455.0, "completions/max_terminated_length": 1455.0, "completions/mean_length": 621.46875, "completions/mean_terminated_length": 621.46875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.027140052914568575, "frac_reward_zero_std": 0.875, "grad_norm": 0.1333021761410628, "kl": 0.03302001953125, "learning_rate": 5.392491467576792e-06, "loss": 0.0002, "num_tokens": 35194797.0, "reward": 2.0263671875, "reward_std": 0.05180611088871956, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 709.9765625, "completions/mean_terminated_length": 709.9765625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.027310745071263975, "frac_reward_zero_std": 0.625, "grad_norm": 0.21665655922840596, "kl": 0.02960205078125, "learning_rate": 5.426621160409556e-06, "loss": 0.0113, "num_tokens": 35425063.0, "reward": 2.21484375, "reward_std": 0.15436092019081116, "rewards/accuracy_reward/mean": 0.21484375, "rewards/accuracy_reward/std": 0.4115184545516968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1641.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 757.796875, "completions/mean_terminated_length": 757.796875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.027481437227959375, "frac_reward_zero_std": 0.5625, "grad_norm": 0.19788488758785697, "kl": 0.026824951171875, "learning_rate": 5.4607508532423215e-06, "loss": -0.0193, "num_tokens": 35660531.0, "reward": 2.2216796875, "reward_std": 0.16026270389556885, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.41942715644836426, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1420.0, "completions/max_terminated_length": 1420.0, "completions/mean_length": 674.5234375, "completions/mean_terminated_length": 674.5234375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.027652129384654774, "frac_reward_zero_std": 0.4375, "grad_norm": 0.2672788218743463, "kl": 0.033538818359375, "learning_rate": 5.494880546075085e-06, "loss": 0.017, "num_tokens": 35869209.0, "reward": 2.35546875, "reward_std": 0.2327008694410324, "rewards/accuracy_reward/mean": 0.35546875, "rewards/accuracy_reward/std": 0.4795927405357361, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1756.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 710.328125, "completions/mean_terminated_length": 710.328125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.027822821541350174, "frac_reward_zero_std": 0.4375, "grad_norm": 0.2638372886255071, "kl": 0.03289794921875, "learning_rate": 5.529010238907851e-06, "loss": 0.0245, "num_tokens": 36093901.0, "reward": 2.28125, "reward_std": 0.20762386918067932, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.45048993825912476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1931.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 730.51953125, "completions/mean_terminated_length": 730.51953125, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.027993513698045574, "frac_reward_zero_std": 0.625, "grad_norm": 0.1980870769417551, "kl": 0.03118896484375, "learning_rate": 5.5631399317406145e-06, "loss": 0.0088, "num_tokens": 36323106.0, "reward": 2.23046875, "reward_std": 0.14109118282794952, "rewards/accuracy_reward/mean": 0.23046875, "rewards/accuracy_reward/std": 0.4219578504562378, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 653.49609375, "completions/mean_terminated_length": 653.49609375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.028164205854740974, "frac_reward_zero_std": 0.8125, "grad_norm": 0.16705328620350893, "kl": 0.03363037109375, "learning_rate": 5.597269624573379e-06, "loss": 0.0018, "num_tokens": 36534145.0, "reward": 2.0234375, "reward_std": 0.062167368829250336, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 746.578125, "completions/mean_terminated_length": 746.578125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.028334898011436373, "frac_reward_zero_std": 0.625, "grad_norm": 0.2771173245913011, "kl": 0.0340576171875, "learning_rate": 5.631399317406145e-06, "loss": -0.0012, "num_tokens": 36761301.0, "reward": 2.1240234375, "reward_std": 0.13791713118553162, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 734.93359375, "completions/mean_terminated_length": 729.7843627929688, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.028505590168131773, "frac_reward_zero_std": 0.625, "grad_norm": 0.20299008465085588, "kl": 0.0323486328125, "learning_rate": 5.665529010238908e-06, "loss": 0.0152, "num_tokens": 36985236.0, "reward": 2.1533203125, "reward_std": 0.1910880208015442, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.36746934056282043, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1683.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 662.46484375, "completions/mean_terminated_length": 662.46484375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.028676282324827173, "frac_reward_zero_std": 0.6875, "grad_norm": 0.27522045299870695, "kl": 0.0443115234375, "learning_rate": 5.699658703071673e-06, "loss": -0.0027, "num_tokens": 37199307.0, "reward": 2.12890625, "reward_std": 0.12151285260915756, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1478.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 634.1953125, "completions/mean_terminated_length": 634.1953125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.028846974481522573, "frac_reward_zero_std": 0.5625, "grad_norm": 0.25692600040129426, "kl": 0.03814697265625, "learning_rate": 5.733788395904437e-06, "loss": -0.0049, "num_tokens": 37401277.0, "reward": 2.20703125, "reward_std": 0.17582818865776062, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1641.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 740.140625, "completions/mean_terminated_length": 737.807861328125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.029017666638217973, "frac_reward_zero_std": 0.75, "grad_norm": 6626.919314332015, "kl": 144.0274658203125, "learning_rate": 5.767918088737202e-06, "loss": 5.7452, "num_tokens": 37632609.0, "reward": 2.08203125, "reward_std": 0.06822281330823898, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2749498784542084, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 681.18359375, "completions/mean_terminated_length": 681.18359375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.029188358794913372, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2512653609020258, "kl": 0.04046630859375, "learning_rate": 5.802047781569966e-06, "loss": 0.0205, "num_tokens": 37852192.0, "reward": 2.234375, "reward_std": 0.11431500315666199, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42443734407424927, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1253.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 689.95703125, "completions/mean_terminated_length": 689.95703125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.029359050951608772, "frac_reward_zero_std": 0.75, "grad_norm": 0.21348181369389063, "kl": 0.04156494140625, "learning_rate": 5.8361774744027315e-06, "loss": 0.0028, "num_tokens": 38074485.0, "reward": 2.07421875, "reward_std": 0.09287451207637787, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1542.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 626.6328125, "completions/mean_terminated_length": 626.6328125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.029529743108304172, "frac_reward_zero_std": 0.75, "grad_norm": 0.21662149359836272, "kl": 0.047607421875, "learning_rate": 5.870307167235495e-06, "loss": 0.0011, "num_tokens": 38277815.0, "reward": 2.263671875, "reward_std": 0.09890169650316238, "rewards/accuracy_reward/mean": 0.26953125, "rewards/accuracy_reward/std": 0.44458550214767456, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1713.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 588.71875, "completions/mean_terminated_length": 588.71875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.02970043526499957, "frac_reward_zero_std": 0.4375, "grad_norm": 0.351682240901431, "kl": 0.05328369140625, "learning_rate": 5.90443686006826e-06, "loss": -0.0111, "num_tokens": 38473119.0, "reward": 2.234375, "reward_std": 0.19673693180084229, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42443734407424927, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 612.81640625, "completions/mean_terminated_length": 612.81640625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.02987112742169497, "frac_reward_zero_std": 0.25, "grad_norm": 0.4048727587755034, "kl": 0.04901123046875, "learning_rate": 5.938566552901024e-06, "loss": -0.0093, "num_tokens": 38687552.0, "reward": 2.2431640625, "reward_std": 0.2722800374031067, "rewards/accuracy_reward/mean": 0.25390625, "rewards/accuracy_reward/std": 0.4360972046852112, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 683.23828125, "completions/mean_terminated_length": 677.8862915039062, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.030041819578390375, "frac_reward_zero_std": 0.6875, "grad_norm": 0.25903334456363547, "kl": 0.042724609375, "learning_rate": 5.972696245733789e-06, "loss": 0.0073, "num_tokens": 38906141.0, "reward": 2.1572265625, "reward_std": 0.13326166570186615, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 744.40234375, "completions/mean_terminated_length": 739.2902221679688, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.030212511735085774, "frac_reward_zero_std": 0.375, "grad_norm": 0.2630208158630101, "kl": 0.042724609375, "learning_rate": 6.006825938566554e-06, "loss": 0.0446, "num_tokens": 39137604.0, "reward": 2.21875, "reward_std": 0.23726484179496765, "rewards/accuracy_reward/mean": 0.23046875, "rewards/accuracy_reward/std": 0.4219578504562378, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 748.74609375, "completions/mean_terminated_length": 743.6510009765625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.030383203891781174, "frac_reward_zero_std": 0.5625, "grad_norm": 0.26032391729074006, "kl": 0.04815673828125, "learning_rate": 6.0409556313993175e-06, "loss": 0.0196, "num_tokens": 39367667.0, "reward": 2.0615234375, "reward_std": 0.17503029108047485, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05169277638196945, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1803.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 803.68359375, "completions/mean_terminated_length": 799.7647705078125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.030553896048476574, "frac_reward_zero_std": 0.8125, "grad_norm": 0.17223059181734682, "kl": 0.05126953125, "learning_rate": 6.075085324232083e-06, "loss": 0.0081, "num_tokens": 39616066.0, "reward": 2.08984375, "reward_std": 0.07284127175807953, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1824.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 775.6640625, "completions/mean_terminated_length": 775.6640625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.030724588205171974, "frac_reward_zero_std": 0.5, "grad_norm": 0.24336438212656752, "kl": 0.03643798828125, "learning_rate": 6.109215017064847e-06, "loss": 0.0004, "num_tokens": 39852668.0, "reward": 2.240234375, "reward_std": 0.1963537484407425, "rewards/accuracy_reward/mean": 0.25390625, "rewards/accuracy_reward/std": 0.4360972046852112, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 729.62890625, "completions/mean_terminated_length": 724.4588623046875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.030895280361867374, "frac_reward_zero_std": 0.4375, "grad_norm": 0.24456388982924862, "kl": 0.03912353515625, "learning_rate": 6.143344709897611e-06, "loss": 0.0334, "num_tokens": 40077181.0, "reward": 2.140625, "reward_std": 0.26631540060043335, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.05805254727602005, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1562.0, "completions/max_terminated_length": 1562.0, "completions/mean_length": 665.83203125, "completions/mean_terminated_length": 665.83203125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.031065972518562773, "frac_reward_zero_std": 0.4375, "grad_norm": 0.281803730853595, "kl": 0.04376220703125, "learning_rate": 6.177474402730376e-06, "loss": -0.0038, "num_tokens": 40290834.0, "reward": 2.189453125, "reward_std": 0.23681318759918213, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1483.0, "completions/max_terminated_length": 1483.0, "completions/mean_length": 808.0625, "completions/mean_terminated_length": 807.2353515625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.031236664675258173, "frac_reward_zero_std": 0.3125, "grad_norm": 0.33240646850269073, "kl": 0.05609130859375, "learning_rate": 6.211604095563141e-06, "loss": 0.0236, "num_tokens": 40558466.0, "reward": 2.11328125, "reward_std": 0.29429784417152405, "rewards/accuracy_reward/mean": 0.1696428507566452, "rewards/accuracy_reward/std": 0.37615931034088135, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.05805254727602005, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 952.5390625, "completions/mean_terminated_length": 948.2431640625, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.03140735683195357, "frac_reward_zero_std": 0.375, "grad_norm": 0.22087365304788278, "kl": 0.030242919921875, "learning_rate": 6.245733788395904e-06, "loss": 0.0143, "num_tokens": 40839068.0, "reward": 2.0810546875, "reward_std": 0.27277693152427673, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.05999100208282471, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1297.0, "completions/max_terminated_length": 1297.0, "completions/mean_length": 651.08984375, "completions/mean_terminated_length": 651.08984375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.03157804898864897, "frac_reward_zero_std": 0.3125, "grad_norm": 0.34517103610795585, "kl": 0.04376220703125, "learning_rate": 6.27986348122867e-06, "loss": 0.0106, "num_tokens": 41042931.0, "reward": 2.12890625, "reward_std": 0.29743945598602295, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.3562295734882355, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1955.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 691.62109375, "completions/mean_terminated_length": 691.62109375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.03174874114534437, "frac_reward_zero_std": 0.375, "grad_norm": 0.30505244082992594, "kl": 0.03961181640625, "learning_rate": 6.313993174061434e-06, "loss": -0.001, "num_tokens": 41257570.0, "reward": 2.1142578125, "reward_std": 0.21444031596183777, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.040850620716810226, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1506.0, "completions/max_terminated_length": 1506.0, "completions/mean_length": 735.24609375, "completions/mean_terminated_length": 735.24609375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.03191943330203977, "frac_reward_zero_std": 0.5625, "grad_norm": 0.19885860756237236, "kl": 0.0372314453125, "learning_rate": 6.348122866894198e-06, "loss": -0.0046, "num_tokens": 41486769.0, "reward": 2.1025390625, "reward_std": 0.1622537225484848, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1708.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 691.1328125, "completions/mean_terminated_length": 691.1328125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.03209012545873517, "frac_reward_zero_std": 0.5, "grad_norm": 0.30647600176585144, "kl": 0.047607421875, "learning_rate": 6.382252559726962e-06, "loss": 0.0089, "num_tokens": 41706019.0, "reward": 2.10546875, "reward_std": 0.20132558047771454, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1688.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 739.828125, "completions/mean_terminated_length": 739.828125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.03226081761543057, "frac_reward_zero_std": 0.5, "grad_norm": 0.20899180765047998, "kl": 0.0377197265625, "learning_rate": 6.4163822525597275e-06, "loss": -0.0065, "num_tokens": 41945463.0, "reward": 2.2490234375, "reward_std": 0.2213723212480545, "rewards/accuracy_reward/mean": 0.25390625, "rewards/accuracy_reward/std": 0.4360972046852112, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 708.51953125, "completions/mean_terminated_length": 708.51953125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.03243150977212597, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2551532599906762, "kl": 0.0447998046875, "learning_rate": 6.450511945392492e-06, "loss": -0.0004, "num_tokens": 42164492.0, "reward": 2.1884765625, "reward_std": 0.1517634093761444, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03488371521234512, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1304.0, "completions/max_terminated_length": 1304.0, "completions/mean_length": 650.1015625, "completions/mean_terminated_length": 650.1015625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.03260220192882137, "frac_reward_zero_std": 0.3125, "grad_norm": 0.2851995217423572, "kl": 0.04449462890625, "learning_rate": 6.484641638225257e-06, "loss": 0.0025, "num_tokens": 42365958.0, "reward": 2.333984375, "reward_std": 0.27621495723724365, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.47588926553726196, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 610.0, "completions/mean_terminated_length": 610.0, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.03277289408551677, "frac_reward_zero_std": 0.8125, "grad_norm": 0.21080466683015656, "kl": 0.05377197265625, "learning_rate": 6.518771331058021e-06, "loss": 0.0088, "num_tokens": 42562534.0, "reward": 2.0771484375, "reward_std": 0.06607361882925034, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2749498784542084, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1265.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 604.33984375, "completions/mean_terminated_length": 604.33984375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.03294358624221217, "frac_reward_zero_std": 0.6875, "grad_norm": 0.15846698990880667, "kl": 0.0467529296875, "learning_rate": 6.552901023890785e-06, "loss": -0.0047, "num_tokens": 42758077.0, "reward": 2.251953125, "reward_std": 0.13862939178943634, "rewards/accuracy_reward/mean": 0.26171875, "rewards/accuracy_reward/std": 0.4404313564300537, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1424.0, "completions/max_terminated_length": 1424.0, "completions/mean_length": 694.83203125, "completions/mean_terminated_length": 694.83203125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.03311427839890757, "frac_reward_zero_std": 0.5625, "grad_norm": 0.295824613288295, "kl": 0.0496826171875, "learning_rate": 6.587030716723551e-06, "loss": 0.0086, "num_tokens": 42976802.0, "reward": 2.06640625, "reward_std": 0.14753374457359314, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1943.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 796.46875, "completions/mean_terminated_length": 796.46875, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.03328497055560297, "frac_reward_zero_std": 0.625, "grad_norm": 0.20863387243539067, "kl": 0.04559326171875, "learning_rate": 6.621160409556314e-06, "loss": 0.0195, "num_tokens": 43226426.0, "reward": 2.1015625, "reward_std": 0.15461406111717224, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 631.8671875, "completions/mean_terminated_length": 631.8671875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.03345566271229837, "frac_reward_zero_std": 0.75, "grad_norm": 0.23745707973563895, "kl": 0.0560302734375, "learning_rate": 6.655290102389079e-06, "loss": 0.0173, "num_tokens": 43428984.0, "reward": 2.203125, "reward_std": 0.1000204086303711, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40311288833618164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1286.0, "completions/max_terminated_length": 1286.0, "completions/mean_length": 618.7109375, "completions/mean_terminated_length": 618.7109375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.03362635486899377, "frac_reward_zero_std": 0.625, "grad_norm": 0.3466704199395134, "kl": 0.05816650390625, "learning_rate": 6.689419795221843e-06, "loss": 0.0021, "num_tokens": 43626030.0, "reward": 2.1484375, "reward_std": 0.13281384110450745, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.3562295734882355, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 609.48046875, "completions/mean_terminated_length": 609.48046875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.03379704702568917, "frac_reward_zero_std": 0.5, "grad_norm": 0.31017912135894316, "kl": 0.05120849609375, "learning_rate": 6.723549488054608e-06, "loss": -0.0219, "num_tokens": 43825913.0, "reward": 2.0810546875, "reward_std": 0.17898981273174286, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1742.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 656.43359375, "completions/mean_terminated_length": 656.43359375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.03396773918238457, "frac_reward_zero_std": 0.75, "grad_norm": 0.2302899338348131, "kl": 0.0491943359375, "learning_rate": 6.757679180887372e-06, "loss": 0.0004, "num_tokens": 44034984.0, "reward": 2.1708984375, "reward_std": 0.07597580552101135, "rewards/accuracy_reward/mean": 0.17578125, "rewards/accuracy_reward/std": 0.3813795745372772, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1510.0, "completions/max_terminated_length": 1510.0, "completions/mean_length": 685.73828125, "completions/mean_terminated_length": 685.73828125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.03413843133907997, "frac_reward_zero_std": 0.625, "grad_norm": 0.18865987348642888, "kl": 0.04937744140625, "learning_rate": 6.7918088737201375e-06, "loss": 0.0012, "num_tokens": 44252277.0, "reward": 2.1015625, "reward_std": 0.15911275148391724, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1314.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 723.2890625, "completions/mean_terminated_length": 723.2890625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.03430912349577537, "frac_reward_zero_std": 0.8125, "grad_norm": 0.14421904473109068, "kl": 0.0462646484375, "learning_rate": 6.825938566552902e-06, "loss": 0.0068, "num_tokens": 44476015.0, "reward": 2.0478515625, "reward_std": 0.0901300460100174, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1685.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 605.2421875, "completions/mean_terminated_length": 603.4000244140625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.03447981565247077, "frac_reward_zero_std": 0.75, "grad_norm": 0.2401341828315565, "kl": 0.068359375, "learning_rate": 6.860068259385666e-06, "loss": -0.0071, "num_tokens": 44669277.0, "reward": 2.1123046875, "reward_std": 0.12826605141162872, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 637.6484375, "completions/mean_terminated_length": 637.6484375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.03465050780916617, "frac_reward_zero_std": 0.75, "grad_norm": 0.16700152415529348, "kl": 0.05255126953125, "learning_rate": 6.894197952218431e-06, "loss": -0.0035, "num_tokens": 44872659.0, "reward": 2.11328125, "reward_std": 0.07779236882925034, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1841.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 690.12890625, "completions/mean_terminated_length": 690.12890625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.03482119996586157, "frac_reward_zero_std": 0.6875, "grad_norm": 0.22578843062449236, "kl": 0.05181884765625, "learning_rate": 6.928327645051195e-06, "loss": 0.0072, "num_tokens": 45092004.0, "reward": 2.08984375, "reward_std": 0.13016413152217865, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1315.0, "completions/max_terminated_length": 1315.0, "completions/mean_length": 600.33984375, "completions/mean_terminated_length": 600.33984375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.03499189212255697, "frac_reward_zero_std": 0.5625, "grad_norm": 0.28124312068062507, "kl": 0.05487060546875, "learning_rate": 6.96245733788396e-06, "loss": 0.0204, "num_tokens": 45287019.0, "reward": 2.2294921875, "reward_std": 0.1868765652179718, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42443734407424927, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 658.8671875, "completions/mean_terminated_length": 657.2667236328125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.03516258427925237, "frac_reward_zero_std": 0.625, "grad_norm": 0.32417221625457543, "kl": 0.06378173828125, "learning_rate": 6.9965870307167235e-06, "loss": -0.0042, "num_tokens": 45497849.0, "reward": 2.12109375, "reward_std": 0.12469445914030075, "rewards/accuracy_reward/mean": 0.12916666269302368, "rewards/accuracy_reward/std": 0.3360852301120758, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1611.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 680.71484375, "completions/mean_terminated_length": 680.71484375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.03533327643594777, "frac_reward_zero_std": 0.5, "grad_norm": 0.19612395492182969, "kl": 0.04205322265625, "learning_rate": 7.030716723549489e-06, "loss": 0.0065, "num_tokens": 45710176.0, "reward": 2.23828125, "reward_std": 0.20843851566314697, "rewards/accuracy_reward/mean": 0.2541666626930237, "rewards/accuracy_reward/std": 0.43630167841911316, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1764.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 644.16015625, "completions/mean_terminated_length": 644.16015625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.03550396859264317, "frac_reward_zero_std": 0.5, "grad_norm": 0.30671000161790496, "kl": 0.05029296875, "learning_rate": 7.064846416382253e-06, "loss": 0.0346, "num_tokens": 45924217.0, "reward": 2.365234375, "reward_std": 0.2026320993900299, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.4850712716579437, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 726.796875, "completions/mean_terminated_length": 726.796875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.03567466074933857, "frac_reward_zero_std": 0.6875, "grad_norm": 0.211331564841483, "kl": 0.0537109375, "learning_rate": 7.098976109215017e-06, "loss": -0.0017, "num_tokens": 46146485.0, "reward": 2.15625, "reward_std": 0.1311618983745575, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1848.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 799.99609375, "completions/mean_terminated_length": 799.99609375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.03584535290603397, "frac_reward_zero_std": 0.875, "grad_norm": 0.09994471897951429, "kl": 0.0458984375, "learning_rate": 7.133105802047782e-06, "loss": 0.0056, "num_tokens": 46397092.0, "reward": 2.16015625, "reward_std": 0.053145404905080795, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.36746934056282043, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2013.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 720.95703125, "completions/mean_terminated_length": 720.95703125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.03601604506272937, "frac_reward_zero_std": 0.5, "grad_norm": 0.25547606561280667, "kl": 0.04718017578125, "learning_rate": 7.167235494880547e-06, "loss": 0.0056, "num_tokens": 46631321.0, "reward": 2.25, "reward_std": 0.2155844122171402, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4338609278202057, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 678.3125, "completions/mean_terminated_length": 678.3125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.03618673721942477, "frac_reward_zero_std": 0.625, "grad_norm": 0.18001280277151474, "kl": 0.049560546875, "learning_rate": 7.201365187713312e-06, "loss": 0.0067, "num_tokens": 46845865.0, "reward": 2.1943359375, "reward_std": 0.12959778308868408, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 710.69140625, "completions/mean_terminated_length": 705.4470825195312, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.036357429376120166, "frac_reward_zero_std": 0.6875, "grad_norm": 0.22793043506954483, "kl": 0.0426025390625, "learning_rate": 7.235494880546076e-06, "loss": 0.0337, "num_tokens": 47070026.0, "reward": 2.1240234375, "reward_std": 0.1310899555683136, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 692.41015625, "completions/mean_terminated_length": 689.3490600585938, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.036528121532815566, "frac_reward_zero_std": 0.5, "grad_norm": 0.7222097181467839, "kl": 0.06195068359375, "learning_rate": 7.2696245733788405e-06, "loss": -0.0048, "num_tokens": 47287907.0, "reward": 2.2333984375, "reward_std": 0.1850452572107315, "rewards/accuracy_reward/mean": 0.23828125, "rewards/accuracy_reward/std": 0.4268665909767151, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1850.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 809.69921875, "completions/mean_terminated_length": 809.69921875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.036698813689510966, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2159742405564356, "kl": 0.0440673828125, "learning_rate": 7.303754266211604e-06, "loss": 0.0118, "num_tokens": 47537862.0, "reward": 2.1953125, "reward_std": 0.1937704086303711, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1694.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 686.71484375, "completions/mean_terminated_length": 686.71484375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.036869505846206366, "frac_reward_zero_std": 0.4375, "grad_norm": 0.26450002289277524, "kl": 0.061279296875, "learning_rate": 7.33788395904437e-06, "loss": 0.0045, "num_tokens": 47758621.0, "reward": 2.146484375, "reward_std": 0.20524094998836517, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1424.0, "completions/max_terminated_length": 1424.0, "completions/mean_length": 608.24609375, "completions/mean_terminated_length": 608.24609375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.037040198002901766, "frac_reward_zero_std": 0.375, "grad_norm": 0.4144993461367294, "kl": 0.05810546875, "learning_rate": 7.3720136518771335e-06, "loss": -0.0025, "num_tokens": 47953436.0, "reward": 2.212890625, "reward_std": 0.27312833070755005, "rewards/accuracy_reward/mean": 0.22265625, "rewards/accuracy_reward/std": 0.41684433817863464, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 775.98828125, "completions/mean_terminated_length": 769.7874145507812, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.037210890159597165, "frac_reward_zero_std": 0.6875, "grad_norm": 0.20432398412084854, "kl": 0.06158447265625, "learning_rate": 7.406143344709898e-06, "loss": 0.0197, "num_tokens": 48190233.0, "reward": 2.0634765625, "reward_std": 0.1371578425168991, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2561737895011902, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 1689.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 746.8203125, "completions/mean_terminated_length": 743.5748291015625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.037381582316292565, "frac_reward_zero_std": 0.8125, "grad_norm": 0.10667534629987396, "kl": 0.08990478515625, "learning_rate": 7.440273037542663e-06, "loss": 0.0034, "num_tokens": 48419979.0, "reward": 2.1640625, "reward_std": 0.07349544763565063, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.3710577189922333, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 778.78515625, "completions/mean_terminated_length": 761.0361328125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.037552274472987965, "frac_reward_zero_std": 0.625, "grad_norm": 0.20894978683306817, "kl": 0.16937255859375, "learning_rate": 7.474402730375427e-06, "loss": -0.0066, "num_tokens": 48661492.0, "reward": 2.19921875, "reward_std": 0.16130754351615906, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 749.953125, "completions/mean_terminated_length": 728.1325073242188, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.037722966629683365, "frac_reward_zero_std": 0.5, "grad_norm": 0.32886860226640835, "kl": 0.162109375, "learning_rate": 7.508532423208191e-06, "loss": 0.0163, "num_tokens": 48896968.0, "reward": 2.2021484375, "reward_std": 0.20793652534484863, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 659.5625, "completions/mean_terminated_length": 655.8458862304688, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.037893658786378764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7109142886886256, "kl": 0.08477783203125, "learning_rate": 7.542662116040957e-06, "loss": 0.0047, "num_tokens": 49104824.0, "reward": 2.2255859375, "reward_std": 0.18572431802749634, "rewards/accuracy_reward/mean": 0.23046875, "rewards/accuracy_reward/std": 0.4219578504562378, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.6875, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 728.19140625, "completions/mean_terminated_length": 708.2926635742188, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.038064350943074164, "frac_reward_zero_std": 0.6875, "grad_norm": 0.24051883040194502, "kl": 0.2080078125, "learning_rate": 7.57679180887372e-06, "loss": 0.0035, "num_tokens": 49329705.0, "reward": 2.1328125, "reward_std": 0.11146602779626846, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.59375, "completions/max_length": 1781.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 752.12109375, "completions/mean_terminated_length": 724.4773559570312, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.038235043099769564, "frac_reward_zero_std": 0.6875, "grad_norm": 0.22075985064051637, "kl": 0.257568359375, "learning_rate": 7.610921501706485e-06, "loss": -0.0012, "num_tokens": 49564344.0, "reward": 2.16796875, "reward_std": 0.12664085626602173, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.3745708465576172, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.71875, "completions/max_length": 1876.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 865.90234375, "completions/mean_terminated_length": 844.7490234375, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.038405735256464964, "frac_reward_zero_std": 0.625, "grad_norm": 0.43830646324071837, "kl": 0.1915283203125, "learning_rate": 7.64505119453925e-06, "loss": 0.0166, "num_tokens": 49830847.0, "reward": 2.1240234375, "reward_std": 0.1262454390525818, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 588.203125, "completions/mean_terminated_length": 583.4703979492188, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.038576427413160363, "frac_reward_zero_std": 0.875, "grad_norm": 0.13533608962017626, "kl": 0.125732421875, "learning_rate": 7.679180887372013e-06, "loss": -0.0046, "num_tokens": 50016451.0, "reward": 2.1513671875, "reward_std": 0.05180611088871956, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.37345683574676514, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 1499.0, "completions/max_terminated_length": 1499.0, "completions/mean_length": 686.5078125, "completions/mean_terminated_length": 677.5079956054688, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.03874711956985576, "frac_reward_zero_std": 0.625, "grad_norm": 0.25228951352780543, "kl": 0.10662841796875, "learning_rate": 7.713310580204779e-06, "loss": -0.0014, "num_tokens": 50227941.0, "reward": 2.16796875, "reward_std": 0.13699321448802948, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.3745708465576172, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1918.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 723.8515625, "completions/mean_terminated_length": 724.5608520507812, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.03891781172655116, "frac_reward_zero_std": 0.625, "grad_norm": 3.365983199597984, "kl": 0.064453125, "learning_rate": 7.747440273037543e-06, "loss": 0.0263, "num_tokens": 50462015.0, "reward": 2.12890625, "reward_std": 0.12037044763565063, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 878.02734375, "completions/mean_terminated_length": 869.9487915039062, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.03908850388324656, "frac_reward_zero_std": 0.625, "grad_norm": 0.41404645024818626, "kl": 0.08203125, "learning_rate": 7.781569965870308e-06, "loss": 0.0324, "num_tokens": 50719638.0, "reward": 2.2119140625, "reward_std": 0.18167492747306824, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41420844197273254, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1907.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 730.7421875, "completions/mean_terminated_length": 730.7421875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.03925919603994196, "frac_reward_zero_std": 0.625, "grad_norm": 0.2373819044941149, "kl": 0.04718017578125, "learning_rate": 7.815699658703072e-06, "loss": 0.0147, "num_tokens": 50947700.0, "reward": 2.27734375, "reward_std": 0.12333696335554123, "rewards/accuracy_reward/mean": 0.27734375, "rewards/accuracy_reward/std": 0.4485645890235901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1712.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 786.76953125, "completions/mean_terminated_length": 786.76953125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.03942988819663736, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2011072296436743, "kl": 0.044189453125, "learning_rate": 7.849829351535837e-06, "loss": 0.0069, "num_tokens": 51186649.0, "reward": 2.1845703125, "reward_std": 0.15439525246620178, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1827.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 711.62890625, "completions/mean_terminated_length": 707.2549438476562, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.03960058035333276, "frac_reward_zero_std": 0.75, "grad_norm": 3.0602463103207636, "kl": 0.12286376953125, "learning_rate": 7.883959044368601e-06, "loss": 0.0184, "num_tokens": 51407114.0, "reward": 2.1259765625, "reward_std": 0.11621679365634918, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 601.91015625, "completions/mean_terminated_length": 601.91015625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.03977127251002816, "frac_reward_zero_std": 0.625, "grad_norm": 0.28061030988278896, "kl": 0.0638427734375, "learning_rate": 7.918088737201367e-06, "loss": 0.022, "num_tokens": 51601523.0, "reward": 2.30078125, "reward_std": 0.16682013869285583, "rewards/accuracy_reward/mean": 0.30078125, "rewards/accuracy_reward/std": 0.45949608087539673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1943.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 706.80859375, "completions/mean_terminated_length": 706.80859375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.03994196466672356, "frac_reward_zero_std": 0.8125, "grad_norm": 0.1054706202461205, "kl": 0.05401611328125, "learning_rate": 7.95221843003413e-06, "loss": 0.0164, "num_tokens": 51821042.0, "reward": 2.0859375, "reward_std": 0.0873890146613121, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 691.4375, "completions/mean_terminated_length": 686.11767578125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.04011265682341896, "frac_reward_zero_std": 0.5625, "grad_norm": 0.26830124777392805, "kl": 0.05633544921875, "learning_rate": 7.986348122866894e-06, "loss": 0.0361, "num_tokens": 52044018.0, "reward": 2.04296875, "reward_std": 0.16723985970020294, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22781464457511902, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1445.0, "completions/max_terminated_length": 1445.0, "completions/mean_length": 614.9140625, "completions/mean_terminated_length": 614.9140625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.04028334898011436, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2571253331260896, "kl": 0.0665283203125, "learning_rate": 8.02047781569966e-06, "loss": 0.0002, "num_tokens": 52240540.0, "reward": 2.216796875, "reward_std": 0.22714783251285553, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.41942715644836426, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 778.8125, "completions/mean_terminated_length": 773.8353271484375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.04045404113680976, "frac_reward_zero_std": 0.625, "grad_norm": 0.21795005101502185, "kl": 0.05712890625, "learning_rate": 8.054607508532423e-06, "loss": 0.0182, "num_tokens": 52487004.0, "reward": 2.150390625, "reward_std": 0.14315377175807953, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.36746934056282043, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1360.0, "completions/max_terminated_length": 1360.0, "completions/mean_length": 654.85546875, "completions/mean_terminated_length": 654.85546875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.04062473329350516, "frac_reward_zero_std": 0.5, "grad_norm": 0.26296552564796266, "kl": 0.068603515625, "learning_rate": 8.088737201365189e-06, "loss": 0.0169, "num_tokens": 52698727.0, "reward": 2.2109375, "reward_std": 0.19531384110450745, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 623.03515625, "completions/mean_terminated_length": 623.03515625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.04079542545020056, "frac_reward_zero_std": 0.5, "grad_norm": 0.2529134271468291, "kl": 0.072265625, "learning_rate": 8.122866894197953e-06, "loss": -0.0158, "num_tokens": 52895552.0, "reward": 2.20703125, "reward_std": 0.20228564739227295, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1811.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 609.2265625, "completions/mean_terminated_length": 609.2265625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.04096611760689596, "frac_reward_zero_std": 0.75, "grad_norm": 0.19165020346503056, "kl": 0.0731201171875, "learning_rate": 8.156996587030718e-06, "loss": -0.0001, "num_tokens": 53090538.0, "reward": 2.25390625, "reward_std": 0.1204879954457283, "rewards/accuracy_reward/mean": 0.25390625, "rewards/accuracy_reward/std": 0.4360972046852112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1596.0, "completions/mean_length": 720.11328125, "completions/mean_terminated_length": 714.9059448242188, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.04113680976359136, "frac_reward_zero_std": 0.5625, "grad_norm": 0.22791912893406524, "kl": 0.0584716796875, "learning_rate": 8.191126279863482e-06, "loss": 0.0103, "num_tokens": 53312119.0, "reward": 2.0888671875, "reward_std": 0.17466580867767334, "rewards/accuracy_reward/mean": 0.10833333432674408, "rewards/accuracy_reward/std": 0.31145045161247253, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.056234680116176605, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1584.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 651.359375, "completions/mean_terminated_length": 651.359375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.04130750192028676, "frac_reward_zero_std": 0.8125, "grad_norm": 0.21700786570274114, "kl": 0.0662841796875, "learning_rate": 8.225255972696247e-06, "loss": 0.005, "num_tokens": 53524115.0, "reward": 2.125, "reward_std": 0.0687704086303711, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1619.0, "completions/max_terminated_length": 1619.0, "completions/mean_length": 660.06640625, "completions/mean_terminated_length": 660.06640625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.04147819407698216, "frac_reward_zero_std": 0.875, "grad_norm": 0.11239783533413983, "kl": 0.06982421875, "learning_rate": 8.259385665529011e-06, "loss": 0.0138, "num_tokens": 53753060.0, "reward": 2.1875, "reward_std": 0.059839196503162384, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 785.140625, "completions/mean_terminated_length": 785.140625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.04164888623367756, "frac_reward_zero_std": 0.5625, "grad_norm": 0.19715189689658297, "kl": 0.052001953125, "learning_rate": 8.293515358361775e-06, "loss": 0.0198, "num_tokens": 53992232.0, "reward": 2.236328125, "reward_std": 0.1967989206314087, "rewards/accuracy_reward/mean": 0.24609375, "rewards/accuracy_reward/std": 0.43157756328582764, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1204.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 609.35546875, "completions/mean_terminated_length": 609.35546875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.04181957839037296, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3111788276042905, "kl": 0.0682373046875, "learning_rate": 8.327645051194539e-06, "loss": 0.0103, "num_tokens": 54191011.0, "reward": 2.1787109375, "reward_std": 0.16546693444252014, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.06436405330896378, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 672.72265625, "completions/mean_terminated_length": 667.3294677734375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.04199027054706836, "frac_reward_zero_std": 0.5, "grad_norm": 0.2460477896326605, "kl": 0.0592041015625, "learning_rate": 8.361774744027304e-06, "loss": 0.0328, "num_tokens": 54403932.0, "reward": 2.2080078125, "reward_std": 0.20515653491020203, "rewards/accuracy_reward/mean": 0.21484375, "rewards/accuracy_reward/std": 0.4115184545516968, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 567.47265625, "completions/mean_terminated_length": 563.682373046875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.04216096270376376, "frac_reward_zero_std": 0.5625, "grad_norm": 76.20447948942623, "kl": 3.3477783203125, "learning_rate": 8.395904436860068e-06, "loss": 0.1445, "num_tokens": 54588693.0, "reward": 2.1669921875, "reward_std": 0.17248210310935974, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 563.1015625, "completions/mean_terminated_length": 563.1015625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.04233165486045916, "frac_reward_zero_std": 0.5625, "grad_norm": 0.23839180267353338, "kl": 0.0670166015625, "learning_rate": 8.430034129692833e-06, "loss": 0.0068, "num_tokens": 54771743.0, "reward": 2.1484375, "reward_std": 0.15096627175807953, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.3562295734882355, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 562.27734375, "completions/mean_terminated_length": 562.27734375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.04250234701715456, "frac_reward_zero_std": 0.625, "grad_norm": 0.24833098396045905, "kl": 0.071044921875, "learning_rate": 8.464163822525599e-06, "loss": 0.0201, "num_tokens": 54962310.0, "reward": 2.2265625, "reward_std": 0.137538880109787, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.41942715644836426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 615.91796875, "completions/mean_terminated_length": 615.91796875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.04267303917384996, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2598703769437258, "kl": 0.0675048828125, "learning_rate": 8.498293515358363e-06, "loss": 0.0045, "num_tokens": 55165793.0, "reward": 2.15625, "reward_std": 0.2024868279695511, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1217.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 638.90234375, "completions/mean_terminated_length": 638.90234375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.042843731330545365, "frac_reward_zero_std": 0.625, "grad_norm": 0.2241556731246729, "kl": 0.0648193359375, "learning_rate": 8.532423208191128e-06, "loss": 0.0099, "num_tokens": 55369624.0, "reward": 2.2734375, "reward_std": 0.16771161556243896, "rewards/accuracy_reward/mean": 0.2734375, "rewards/accuracy_reward/std": 0.446596622467041, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 636.40234375, "completions/mean_terminated_length": 636.40234375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.043014423487240765, "frac_reward_zero_std": 0.8125, "grad_norm": 0.17911764168398395, "kl": 0.0701904296875, "learning_rate": 8.566552901023892e-06, "loss": -0.0075, "num_tokens": 55578463.0, "reward": 2.0703125, "reward_std": 0.0767945945262909, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2561737895011902, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 649.01171875, "completions/mean_terminated_length": 649.01171875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.043185115643936164, "frac_reward_zero_std": 0.75, "grad_norm": 0.18979663473837685, "kl": 0.06109619140625, "learning_rate": 8.600682593856656e-06, "loss": 0.0055, "num_tokens": 55781442.0, "reward": 2.19921875, "reward_std": 0.11741296947002411, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1325.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 630.16796875, "completions/mean_terminated_length": 630.16796875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.043355807800631564, "frac_reward_zero_std": 0.625, "grad_norm": 0.2207421055170197, "kl": 0.070068359375, "learning_rate": 8.63481228668942e-06, "loss": 0.0052, "num_tokens": 55982237.0, "reward": 2.13671875, "reward_std": 0.1613617241382599, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.34422317147254944, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1314.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 646.640625, "completions/mean_terminated_length": 646.640625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.043526499957326964, "frac_reward_zero_std": 0.8125, "grad_norm": 0.14441115645492555, "kl": 0.0606689453125, "learning_rate": 8.668941979522185e-06, "loss": 0.0059, "num_tokens": 56188865.0, "reward": 2.06640625, "reward_std": 0.07152669876813889, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1407.0, "completions/max_terminated_length": 1407.0, "completions/mean_length": 664.1328125, "completions/mean_terminated_length": 664.1328125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.043697192114022364, "frac_reward_zero_std": 0.75, "grad_norm": 0.1912024336195425, "kl": 0.060302734375, "learning_rate": 8.703071672354949e-06, "loss": -0.0009, "num_tokens": 56404627.0, "reward": 2.21875, "reward_std": 0.10550326108932495, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41420844197273254, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 678.25, "completions/mean_terminated_length": 672.8784790039062, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.043867884270717764, "frac_reward_zero_std": 0.625, "grad_norm": 0.22370955431328354, "kl": 0.0653076171875, "learning_rate": 8.737201365187714e-06, "loss": 0.0259, "num_tokens": 56622259.0, "reward": 2.1806640625, "reward_std": 0.17865866422653198, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 673.42578125, "completions/mean_terminated_length": 673.42578125, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.04403857642741316, "frac_reward_zero_std": 0.625, "grad_norm": 0.1937133718925755, "kl": 0.06793212890625, "learning_rate": 8.771331058020478e-06, "loss": 0.0083, "num_tokens": 56837056.0, "reward": 2.1435546875, "reward_std": 0.1332949995994568, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.3600577116012573, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 671.01953125, "completions/mean_terminated_length": 671.01953125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.04420926858410856, "frac_reward_zero_std": 0.875, "grad_norm": 0.11662125945287227, "kl": 0.0675048828125, "learning_rate": 8.805460750853243e-06, "loss": 0.0023, "num_tokens": 57053029.0, "reward": 2.11328125, "reward_std": 0.06219445914030075, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 636.77734375, "completions/mean_terminated_length": 634.98828125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.04437996074080396, "frac_reward_zero_std": 0.5, "grad_norm": 1.6206271959971483, "kl": 0.09564208984375, "learning_rate": 8.839590443686009e-06, "loss": 0.0144, "num_tokens": 57257308.0, "reward": 2.2626953125, "reward_std": 0.17814242839813232, "rewards/accuracy_reward/mean": 0.26953125, "rewards/accuracy_reward/std": 0.44458550214767456, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 636.35546875, "completions/mean_terminated_length": 636.35546875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.04455065289749936, "frac_reward_zero_std": 0.8125, "grad_norm": 0.20036699965701182, "kl": 0.0692138671875, "learning_rate": 8.873720136518773e-06, "loss": -0.0063, "num_tokens": 57459351.0, "reward": 2.0234375, "reward_std": 0.05920084938406944, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 718.50390625, "completions/mean_terminated_length": 718.50390625, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.04472134505419476, "frac_reward_zero_std": 0.5625, "grad_norm": 0.17224540752922918, "kl": 0.05804443359375, "learning_rate": 8.907849829351536e-06, "loss": 0.0013, "num_tokens": 57689128.0, "reward": 2.1513671875, "reward_std": 0.16904398798942566, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1441.0, "completions/max_terminated_length": 1441.0, "completions/mean_length": 693.5078125, "completions/mean_terminated_length": 693.5078125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.04489203721089016, "frac_reward_zero_std": 0.6875, "grad_norm": 0.19902538749208712, "kl": 0.0645751953125, "learning_rate": 8.9419795221843e-06, "loss": -0.0033, "num_tokens": 57911594.0, "reward": 2.134765625, "reward_std": 0.1203828901052475, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3483152687549591, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1756.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 748.2421875, "completions/mean_terminated_length": 748.2421875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.04506272936758556, "frac_reward_zero_std": 0.625, "grad_norm": 0.297542704537052, "kl": 0.06732177734375, "learning_rate": 8.976109215017066e-06, "loss": 0.0142, "num_tokens": 58151592.0, "reward": 2.125, "reward_std": 0.1128891110420227, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 781.0703125, "completions/mean_terminated_length": 781.0703125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.04523342152428096, "frac_reward_zero_std": 0.0, "grad_norm": 0.4741884547005608, "kl": 0.073974609375, "learning_rate": 9.01023890784983e-06, "loss": -0.0111, "num_tokens": 58389098.0, "reward": 0.9052734375, "reward_std": 0.7233741283416748, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.328125, "rewards/format_reward/std": 0.47045037150382996, "rewards/tag_count_reward/mean": 0.5654296875, "rewards/tag_count_reward/std": 0.4032541811466217, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1738.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 797.72265625, "completions/mean_terminated_length": 797.72265625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.04540411368097636, "frac_reward_zero_std": 0.4375, "grad_norm": 0.24458076539293735, "kl": 0.05804443359375, "learning_rate": 9.044368600682595e-06, "loss": 0.0015, "num_tokens": 58635811.0, "reward": 2.1474609375, "reward_std": 0.21043968200683594, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1421.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 679.13671875, "completions/mean_terminated_length": 679.13671875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.04557480583767176, "frac_reward_zero_std": 0.625, "grad_norm": 0.23301324909476945, "kl": 0.06439208984375, "learning_rate": 9.078498293515359e-06, "loss": 0.0087, "num_tokens": 58849590.0, "reward": 2.0927734375, "reward_std": 0.14200381934642792, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1653.0, "completions/max_terminated_length": 1653.0, "completions/mean_length": 714.62109375, "completions/mean_terminated_length": 714.62109375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.04574549799436716, "frac_reward_zero_std": 0.5, "grad_norm": 0.2147071900799785, "kl": 0.0670166015625, "learning_rate": 9.112627986348124e-06, "loss": 0.0162, "num_tokens": 59072245.0, "reward": 2.0703125, "reward_std": 0.15591737627983093, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1301.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 715.9921875, "completions/mean_terminated_length": 715.9921875, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.04591619015106256, "frac_reward_zero_std": 0.625, "grad_norm": 0.1970263128299155, "kl": 0.069580078125, "learning_rate": 9.146757679180888e-06, "loss": -0.0008, "num_tokens": 59295683.0, "reward": 2.25390625, "reward_std": 0.16683140397071838, "rewards/accuracy_reward/mean": 0.2578125, "rewards/accuracy_reward/std": 0.4382871091365814, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 637.37109375, "completions/mean_terminated_length": 626.2637939453125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.04608688230775796, "frac_reward_zero_std": 0.0, "grad_norm": 362628658.44042397, "kl": 23461888.061645508, "learning_rate": 9.180887372013653e-06, "loss": 940201.375, "num_tokens": 59499522.0, "reward": 2.146484375, "reward_std": 0.3979509174823761, "rewards/accuracy_reward/mean": 0.26171875, "rewards/accuracy_reward/std": 0.4404313564300537, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26889389753341675, "rewards/tag_count_reward/mean": 0.962890625, "rewards/tag_count_reward/std": 0.1065954640507698, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 652.70703125, "completions/mean_terminated_length": 647.6495971679688, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.04625757446445336, "frac_reward_zero_std": 0.0625, "grad_norm": 16807263.524313748, "kl": 1171456.068725586, "learning_rate": 9.215017064846417e-06, "loss": 47022.0547, "num_tokens": 59706279.0, "reward": 2.056640625, "reward_std": 0.4115876853466034, "rewards/accuracy_reward/mean": 0.2541666626930237, "rewards/accuracy_reward/std": 0.43630167841911316, "rewards/format_reward/mean": 0.87109375, "rewards/format_reward/std": 0.33575257658958435, "rewards/tag_count_reward/mean": 0.947265625, "rewards/tag_count_reward/std": 0.11569205671548843, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2015.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 782.984375, "completions/mean_terminated_length": 782.984375, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.04642826662114876, "frac_reward_zero_std": 0.0, "grad_norm": 0.39133221746341224, "kl": 0.099365234375, "learning_rate": 9.249146757679181e-06, "loss": 0.0094, "num_tokens": 59951299.0, "reward": 1.685546875, "reward_std": 0.5807268619537354, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 0.73828125, "rewards/format_reward/std": 0.4404313564300537, "rewards/tag_count_reward/mean": 0.861328125, "rewards/tag_count_reward/std": 0.17095062136650085, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2002.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 777.58984375, "completions/mean_terminated_length": 776.3412475585938, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.04659895877784416, "frac_reward_zero_std": 0.0, "grad_norm": 0.46869935206957264, "kl": 0.1922607421875, "learning_rate": 9.283276450511946e-06, "loss": 0.0342, "num_tokens": 60193514.0, "reward": 1.908203125, "reward_std": 0.30047526955604553, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2749498784542084, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.841796875, "rewards/tag_count_reward/std": 0.19012662768363953, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1879.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 745.96875, "completions/mean_terminated_length": 744.8510131835938, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.04676965093453956, "frac_reward_zero_std": 0.625, "grad_norm": 1.1243051902204602, "kl": 0.08203125, "learning_rate": 9.31740614334471e-06, "loss": 0.0444, "num_tokens": 60458482.0, "reward": 2.2138671875, "reward_std": 0.13948200643062592, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.41942715644836426, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05169277638196945, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 648.37890625, "completions/mean_terminated_length": 636.5889282226562, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.04694034309123496, "frac_reward_zero_std": 0.5, "grad_norm": 0.23648419595360914, "kl": 0.08056640625, "learning_rate": 9.351535836177476e-06, "loss": 0.0413, "num_tokens": 60667171.0, "reward": 2.1435546875, "reward_std": 0.15705236792564392, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.3710577189922333, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.0639462023973465, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1483.0, "completions/max_terminated_length": 1483.0, "completions/mean_length": 674.97265625, "completions/mean_terminated_length": 674.97265625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.04711103524793036, "frac_reward_zero_std": 0.4375, "grad_norm": 0.27518735983605186, "kl": 0.0703125, "learning_rate": 9.38566552901024e-06, "loss": 0.0037, "num_tokens": 60874252.0, "reward": 2.09765625, "reward_std": 0.21519392728805542, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.06957504153251648, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 780.6484375, "completions/mean_terminated_length": 770.6693115234375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.04728172740462576, "frac_reward_zero_std": 0.375, "grad_norm": 0.2293260796547384, "kl": 0.0694580078125, "learning_rate": 9.419795221843005e-06, "loss": 0.039, "num_tokens": 61115474.0, "reward": 2.1787109375, "reward_std": 0.2008727341890335, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40311288833618164, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.0639462023973465, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 605.12890625, "completions/mean_terminated_length": 602.8306884765625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.04745241956132116, "frac_reward_zero_std": 0.75, "grad_norm": 7.314203691279384, "kl": 0.852783203125, "learning_rate": 9.453924914675769e-06, "loss": 0.0383, "num_tokens": 61316051.0, "reward": 2.0634765625, "reward_std": 0.08842628449201584, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 615.6171875, "completions/mean_terminated_length": 615.6171875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.04762311171801656, "frac_reward_zero_std": 0.8125, "grad_norm": 0.21686259118383538, "kl": 0.0926513671875, "learning_rate": 9.488054607508534e-06, "loss": 0.0085, "num_tokens": 61512577.0, "reward": 2.0654296875, "reward_std": 0.050448618829250336, "rewards/accuracy_reward/mean": 0.07083333283662796, "rewards/accuracy_reward/std": 0.25708237290382385, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 583.73046875, "completions/mean_terminated_length": 583.73046875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.04779380387471196, "frac_reward_zero_std": 0.5, "grad_norm": 0.30830209501627426, "kl": 0.0897216796875, "learning_rate": 9.522184300341298e-06, "loss": 0.0019, "num_tokens": 61700124.0, "reward": 2.20703125, "reward_std": 0.1822051852941513, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1449.0, "completions/max_terminated_length": 1449.0, "completions/mean_length": 614.8046875, "completions/mean_terminated_length": 614.8046875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.04796449603140736, "frac_reward_zero_std": 0.625, "grad_norm": 0.23630733069278256, "kl": 0.0965576171875, "learning_rate": 9.556313993174062e-06, "loss": 0.0105, "num_tokens": 61899642.0, "reward": 2.130859375, "reward_std": 0.16045024991035461, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.04388983175158501, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1322.0, "completions/max_terminated_length": 1322.0, "completions/mean_length": 526.86328125, "completions/mean_terminated_length": 526.86328125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.04813518818810276, "frac_reward_zero_std": 0.375, "grad_norm": 0.41773665657396253, "kl": 0.09912109375, "learning_rate": 9.590443686006825e-06, "loss": 0.0104, "num_tokens": 62075527.0, "reward": 2.2001953125, "reward_std": 0.24824923276901245, "rewards/accuracy_reward/mean": 0.21484375, "rewards/accuracy_reward/std": 0.4115184545516968, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 629.70703125, "completions/mean_terminated_length": 629.70703125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.04830588034479816, "frac_reward_zero_std": 0.4375, "grad_norm": 0.2758198465108124, "kl": 0.0860595703125, "learning_rate": 9.62457337883959e-06, "loss": -0.0005, "num_tokens": 62278780.0, "reward": 2.21875, "reward_std": 0.25553953647613525, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41420844197273254, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1111.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 539.01953125, "completions/mean_terminated_length": 539.01953125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.04847657250149356, "frac_reward_zero_std": 0.6875, "grad_norm": 0.24151497920190568, "kl": 0.09912109375, "learning_rate": 9.658703071672356e-06, "loss": 0.0063, "num_tokens": 62463617.0, "reward": 2.1875, "reward_std": 0.11117157340049744, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 573.8671875, "completions/mean_terminated_length": 573.8671875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.04864726465818896, "frac_reward_zero_std": 0.5625, "grad_norm": 0.30392561081507424, "kl": 0.104248046875, "learning_rate": 9.69283276450512e-06, "loss": 0.0193, "num_tokens": 62647999.0, "reward": 2.19921875, "reward_std": 0.19882795214653015, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 609.40234375, "completions/mean_terminated_length": 607.0392456054688, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.04881795681488436, "frac_reward_zero_std": 0.5, "grad_norm": 0.7054032067455355, "kl": 0.198486328125, "learning_rate": 9.726962457337886e-06, "loss": 0.025, "num_tokens": 62845366.0, "reward": 2.125, "reward_std": 0.17703911662101746, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.34422317147254944, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1250.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 564.65234375, "completions/mean_terminated_length": 564.65234375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.04898864897157976, "frac_reward_zero_std": 0.9375, "grad_norm": 0.10497498316033001, "kl": 0.094482421875, "learning_rate": 9.76109215017065e-06, "loss": 0.0133, "num_tokens": 63032717.0, "reward": 2.04296875, "reward_std": 0.029919598251581192, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 644.1015625, "completions/mean_terminated_length": 644.1015625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.04915934112827516, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3239891081322465, "kl": 0.0887451171875, "learning_rate": 9.795221843003415e-06, "loss": 0.007, "num_tokens": 63241079.0, "reward": 2.150390625, "reward_std": 0.19885092973709106, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.36746934056282043, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 531.375, "completions/mean_terminated_length": 531.375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.049330033284970556, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2961361405979324, "kl": 0.0985107421875, "learning_rate": 9.829351535836179e-06, "loss": 0.0069, "num_tokens": 63417095.0, "reward": 2.16796875, "reward_std": 0.1468954086303711, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.3745708465576172, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1100.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 543.94921875, "completions/mean_terminated_length": 543.94921875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.049500725441665956, "frac_reward_zero_std": 0.5, "grad_norm": 0.288018114944264, "kl": 0.1053466796875, "learning_rate": 9.863481228668942e-06, "loss": 0.009, "num_tokens": 63592298.0, "reward": 2.3046875, "reward_std": 0.21199080348014832, "rewards/accuracy_reward/mean": 0.3046875, "rewards/accuracy_reward/std": 0.4611765742301941, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1937.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 726.8046875, "completions/mean_terminated_length": 726.8046875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.049671417598361356, "frac_reward_zero_std": 0.6875, "grad_norm": 0.20108455042088658, "kl": 0.0865478515625, "learning_rate": 9.897610921501706e-06, "loss": -0.0021, "num_tokens": 63820280.0, "reward": 2.0546875, "reward_std": 0.1080445945262909, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22781464457511902, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 645.29296875, "completions/mean_terminated_length": 645.29296875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.049842109755056756, "frac_reward_zero_std": 0.5, "grad_norm": 0.20378156184859741, "kl": 0.0858154296875, "learning_rate": 9.931740614334472e-06, "loss": 0.0178, "num_tokens": 64023363.0, "reward": 2.34765625, "reward_std": 0.20598775148391724, "rewards/accuracy_reward/mean": 0.34765625, "rewards/accuracy_reward/std": 0.4771590530872345, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 694.75390625, "completions/mean_terminated_length": 694.75390625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.050012801911752156, "frac_reward_zero_std": 0.625, "grad_norm": 0.21950878454381773, "kl": 0.0977783203125, "learning_rate": 9.965870307167235e-06, "loss": 0.0114, "num_tokens": 64244484.0, "reward": 2.171875, "reward_std": 0.1594453752040863, "rewards/accuracy_reward/mean": 0.18333333730697632, "rewards/accuracy_reward/std": 0.38774824142456055, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1616.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 686.91796875, "completions/mean_terminated_length": 686.91796875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.050183494068447555, "frac_reward_zero_std": 0.625, "grad_norm": 0.19895466269128578, "kl": 0.087646484375, "learning_rate": 1e-05, "loss": 0.0275, "num_tokens": 64461823.0, "reward": 2.24609375, "reward_std": 0.15590627491474152, "rewards/accuracy_reward/mean": 0.24609375, "rewards/accuracy_reward/std": 0.43157756328582764, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1629.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 817.8671875, "completions/mean_terminated_length": 817.8671875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.050354186225142955, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1644537227875781, "kl": 0.0787353515625, "learning_rate": 1.0034129692832766e-05, "loss": -0.0002, "num_tokens": 64713021.0, "reward": 2.14453125, "reward_std": 0.14963586628437042, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 764.5625, "completions/mean_terminated_length": 764.5625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.050524878381838355, "frac_reward_zero_std": 0.5625, "grad_norm": 0.18367584088409286, "kl": 0.080810546875, "learning_rate": 1.006825938566553e-05, "loss": 0.0094, "num_tokens": 64951309.0, "reward": 2.2177734375, "reward_std": 0.18196213245391846, "rewards/accuracy_reward/mean": 0.22265625, "rewards/accuracy_reward/std": 0.41684433817863464, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1952.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 854.49609375, "completions/mean_terminated_length": 852.2588500976562, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.050695570538533755, "frac_reward_zero_std": 0.625, "grad_norm": 6.207404707288353, "kl": 1.12158203125, "learning_rate": 1.0102389078498294e-05, "loss": 0.0427, "num_tokens": 65211772.0, "reward": 2.0966796875, "reward_std": 0.13802003860473633, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 893.15234375, "completions/mean_terminated_length": 870.1474609375, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.050866262695229154, "frac_reward_zero_std": 0.5625, "grad_norm": 0.19340991850641742, "kl": 0.0777587890625, "learning_rate": 1.013651877133106e-05, "loss": 0.0208, "num_tokens": 65482419.0, "reward": 2.123046875, "reward_std": 0.17887476086616516, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.0539139099419117, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1729.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 747.34375, "completions/mean_terminated_length": 747.34375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.051036954851924554, "frac_reward_zero_std": 0.6875, "grad_norm": 0.18462637650114, "kl": 0.0987548828125, "learning_rate": 1.0170648464163823e-05, "loss": 0.027, "num_tokens": 65711163.0, "reward": 2.14453125, "reward_std": 0.11799382418394089, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.36746934056282043, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03814799711108208, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 768.96875, "completions/mean_terminated_length": 763.9530029296875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.051207647008619954, "frac_reward_zero_std": 0.6875, "grad_norm": 0.18533453383193244, "kl": 0.09716796875, "learning_rate": 1.0204778156996589e-05, "loss": 0.0176, "num_tokens": 65951411.0, "reward": 2.1142578125, "reward_std": 0.13331109285354614, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 711.07421875, "completions/mean_terminated_length": 711.07421875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.051378339165315354, "frac_reward_zero_std": 0.5625, "grad_norm": 0.25469938894809946, "kl": 0.0906982421875, "learning_rate": 1.0238907849829352e-05, "loss": 0.0087, "num_tokens": 66175814.0, "reward": 2.1240234375, "reward_std": 0.1537952721118927, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1426.0, "completions/max_terminated_length": 1426.0, "completions/mean_length": 722.4375, "completions/mean_terminated_length": 722.4375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.051549031322010753, "frac_reward_zero_std": 0.75, "grad_norm": 0.1926304231759094, "kl": 0.1043701171875, "learning_rate": 1.0273037542662116e-05, "loss": 0.0064, "num_tokens": 66406038.0, "reward": 2.08203125, "reward_std": 0.07779236882925034, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2749498784542084, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 729.2265625, "completions/mean_terminated_length": 729.2265625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.05171972347870615, "frac_reward_zero_std": 0.5, "grad_norm": 0.23096569051770222, "kl": 0.0858154296875, "learning_rate": 1.0307167235494882e-05, "loss": 0.0023, "num_tokens": 66639984.0, "reward": 2.1875, "reward_std": 0.1928790956735611, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1442.0, "completions/max_terminated_length": 1442.0, "completions/mean_length": 754.296875, "completions/mean_terminated_length": 754.296875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.05189041563540155, "frac_reward_zero_std": 0.8125, "grad_norm": 0.1781909108048228, "kl": 0.0816650390625, "learning_rate": 1.0341296928327647e-05, "loss": 0.0147, "num_tokens": 66875788.0, "reward": 2.15234375, "reward_std": 0.0924195945262909, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.3600577116012573, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1405.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 743.87890625, "completions/mean_terminated_length": 743.87890625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.05206110779209695, "frac_reward_zero_std": 0.5, "grad_norm": 0.22112518285484573, "kl": 0.07763671875, "learning_rate": 1.0375426621160409e-05, "loss": 0.0069, "num_tokens": 67104541.0, "reward": 2.1953125, "reward_std": 0.2180875539779663, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 763.39453125, "completions/mean_terminated_length": 763.39453125, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.05223179994879235, "frac_reward_zero_std": 0.6875, "grad_norm": 0.17243167865218326, "kl": 0.07958984375, "learning_rate": 1.0409556313993175e-05, "loss": 0.0131, "num_tokens": 67341842.0, "reward": 2.0927734375, "reward_std": 0.12560710310935974, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29743078351020813, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1227.0, "completions/max_terminated_length": 1227.0, "completions/mean_length": 677.44921875, "completions/mean_terminated_length": 677.44921875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.05240249210548775, "frac_reward_zero_std": 0.75, "grad_norm": 0.18405953546287981, "kl": 0.0963134765625, "learning_rate": 1.044368600682594e-05, "loss": 0.0101, "num_tokens": 67554053.0, "reward": 2.1201171875, "reward_std": 0.1069202646613121, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1383.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 674.27734375, "completions/mean_terminated_length": 674.27734375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.05257318426218315, "frac_reward_zero_std": 0.625, "grad_norm": 0.1898315061150711, "kl": 0.0843505859375, "learning_rate": 1.0477815699658704e-05, "loss": 0.0136, "num_tokens": 67767676.0, "reward": 2.16796875, "reward_std": 0.1544148027896881, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.3745708465576172, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 674.1640625, "completions/mean_terminated_length": 674.1640625, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.05274387641887855, "frac_reward_zero_std": 0.5, "grad_norm": 0.2243722481430077, "kl": 0.0941162109375, "learning_rate": 1.051194539249147e-05, "loss": 0.0108, "num_tokens": 67980390.0, "reward": 2.1787109375, "reward_std": 0.21658019721508026, "rewards/accuracy_reward/mean": 0.19583334028720856, "rewards/accuracy_reward/std": 0.397670179605484, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1366.0, "completions/max_terminated_length": 1366.0, "completions/mean_length": 606.53125, "completions/mean_terminated_length": 606.53125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.05291456857557395, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2754074368114795, "kl": 0.1007080078125, "learning_rate": 1.0546075085324231e-05, "loss": -0.0057, "num_tokens": 68168494.0, "reward": 2.1142578125, "reward_std": 0.13320611417293549, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1150.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 615.640625, "completions/mean_terminated_length": 615.0784912109375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.05308526073226935, "frac_reward_zero_std": 0.5625, "grad_norm": 2.5791697861663496, "kl": 0.1260986328125, "learning_rate": 1.0580204778156997e-05, "loss": 0.0113, "num_tokens": 68363714.0, "reward": 2.1591796875, "reward_std": 0.16907444596290588, "rewards/accuracy_reward/mean": 0.17578125, "rewards/accuracy_reward/std": 0.3813795745372772, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05169277638196945, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1615.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 717.6796875, "completions/mean_terminated_length": 717.6796875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.05325595288896475, "frac_reward_zero_std": 0.5625, "grad_norm": 0.18881756491068008, "kl": 0.08837890625, "learning_rate": 1.0614334470989762e-05, "loss": 0.0203, "num_tokens": 68588448.0, "reward": 2.08984375, "reward_std": 0.14711952209472656, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1265.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 647.63671875, "completions/mean_terminated_length": 647.63671875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.05342664504566015, "frac_reward_zero_std": 0.8125, "grad_norm": 0.19232752382654744, "kl": 0.08447265625, "learning_rate": 1.0648464163822528e-05, "loss": 0.0157, "num_tokens": 68789299.0, "reward": 2.203125, "reward_std": 0.08487267792224884, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40311288833618164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1143.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 554.890625, "completions/mean_terminated_length": 554.890625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.05359733720235555, "frac_reward_zero_std": 0.625, "grad_norm": 0.297625955011031, "kl": 0.10546875, "learning_rate": 1.068259385665529e-05, "loss": 0.0028, "num_tokens": 68971767.0, "reward": 2.1875, "reward_std": 0.13149452209472656, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1200.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 671.62109375, "completions/mean_terminated_length": 671.62109375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.05376802935905095, "frac_reward_zero_std": 0.75, "grad_norm": 0.14829291018604504, "kl": 0.078857421875, "learning_rate": 1.0716723549488055e-05, "loss": 0.0065, "num_tokens": 69189158.0, "reward": 2.08984375, "reward_std": 0.10266819596290588, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1272.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 605.4921875, "completions/mean_terminated_length": 605.4921875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.05393872151574635, "frac_reward_zero_std": 0.375, "grad_norm": 0.31632913646689936, "kl": 0.0828857421875, "learning_rate": 1.075085324232082e-05, "loss": 0.0036, "num_tokens": 69384804.0, "reward": 2.15234375, "reward_std": 0.2656567096710205, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.3600577116012573, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1098.0, "completions/max_terminated_length": 1098.0, "completions/mean_length": 590.31640625, "completions/mean_terminated_length": 590.31640625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.05410941367244175, "frac_reward_zero_std": 0.625, "grad_norm": 0.2497922442157818, "kl": 0.106201171875, "learning_rate": 1.0784982935153585e-05, "loss": 0.007, "num_tokens": 69583893.0, "reward": 2.140625, "reward_std": 0.16503483057022095, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3483152687549591, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 665.17578125, "completions/mean_terminated_length": 665.17578125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.05428010582913715, "frac_reward_zero_std": 0.8125, "grad_norm": 0.14029235760800468, "kl": 0.0845947265625, "learning_rate": 1.081911262798635e-05, "loss": 0.0023, "num_tokens": 69792066.0, "reward": 2.0224609375, "reward_std": 0.07079866528511047, "rewards/accuracy_reward/mean": 0.02916666679084301, "rewards/accuracy_reward/std": 0.16862517595291138, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1291.0, "completions/max_terminated_length": 1291.0, "completions/mean_length": 654.31640625, "completions/mean_terminated_length": 651.8196411132812, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.05445079798583255, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7708542324613357, "kl": 0.0821533203125, "learning_rate": 1.0853242320819112e-05, "loss": 0.0069, "num_tokens": 69996851.0, "reward": 2.1474609375, "reward_std": 0.17081907391548157, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.3710577189922333, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05169277638196945, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1405.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 707.76953125, "completions/mean_terminated_length": 707.76953125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.05462149014252795, "frac_reward_zero_std": 0.625, "grad_norm": 0.1865369558335094, "kl": 0.0775146484375, "learning_rate": 1.0887372013651878e-05, "loss": -0.0096, "num_tokens": 70223304.0, "reward": 2.171875, "reward_std": 0.14523237943649292, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1680.0, "completions/max_terminated_length": 1680.0, "completions/mean_length": 712.57421875, "completions/mean_terminated_length": 712.57421875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.05479218229922335, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2722934648471553, "kl": 0.0810546875, "learning_rate": 1.0921501706484643e-05, "loss": 0.0168, "num_tokens": 70450987.0, "reward": 2.23046875, "reward_std": 0.16926807165145874, "rewards/accuracy_reward/mean": 0.23046875, "rewards/accuracy_reward/std": 0.4219578504562378, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1465.0, "completions/max_terminated_length": 1465.0, "completions/mean_length": 713.78125, "completions/mean_terminated_length": 713.78125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.05496287445591875, "frac_reward_zero_std": 0.6875, "grad_norm": 0.17750276758863584, "kl": 0.0899658203125, "learning_rate": 1.0955631399317409e-05, "loss": 0.0017, "num_tokens": 70675859.0, "reward": 2.1162109375, "reward_std": 0.14804594218730927, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 732.80859375, "completions/mean_terminated_length": 732.80859375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.05513356661261415, "frac_reward_zero_std": 0.75, "grad_norm": 0.20304260029772347, "kl": 0.091796875, "learning_rate": 1.098976109215017e-05, "loss": 0.002, "num_tokens": 70904882.0, "reward": 2.04296875, "reward_std": 0.08461953699588776, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1483.0, "completions/max_terminated_length": 1483.0, "completions/mean_length": 756.23046875, "completions/mean_terminated_length": 756.23046875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.05530425876930955, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1596021089462334, "kl": 0.06982421875, "learning_rate": 1.1023890784982936e-05, "loss": 0.013, "num_tokens": 71139357.0, "reward": 2.1875, "reward_std": 0.1281953752040863, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1996.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 822.859375, "completions/mean_terminated_length": 820.925537109375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.05547495092600495, "frac_reward_zero_std": 0.625, "grad_norm": 0.5565958298111463, "kl": 0.0823974609375, "learning_rate": 1.1058020477815702e-05, "loss": -0.0085, "num_tokens": 71395337.0, "reward": 2.1611328125, "reward_std": 0.1141062006354332, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1648.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 671.6875, "completions/mean_terminated_length": 671.6875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.05564564308270035, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1763241947371778, "kl": 0.0904541015625, "learning_rate": 1.1092150170648465e-05, "loss": 0.0084, "num_tokens": 71604121.0, "reward": 2.16796875, "reward_std": 0.18463993072509766, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.3745708465576172, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 852.0703125, "completions/mean_terminated_length": 842.653564453125, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.05581633523939575, "frac_reward_zero_std": 0.625, "grad_norm": 0.16486277657056256, "kl": 0.0767822265625, "learning_rate": 1.1126279863481229e-05, "loss": 0.0269, "num_tokens": 71863579.0, "reward": 2.158203125, "reward_std": 0.16167478263378143, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.0661611557006836, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1666.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 729.984375, "completions/mean_terminated_length": 729.984375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.05598702739609115, "frac_reward_zero_std": 0.625, "grad_norm": 0.19825411164020545, "kl": 0.0897216796875, "learning_rate": 1.1160409556313993e-05, "loss": 0.0178, "num_tokens": 72092695.0, "reward": 2.166015625, "reward_std": 0.1627320945262909, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1662.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 834.18359375, "completions/mean_terminated_length": 834.18359375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.05615771955278655, "frac_reward_zero_std": 0.625, "grad_norm": 0.19460260260759477, "kl": 0.0828857421875, "learning_rate": 1.1194539249146758e-05, "loss": 0.0179, "num_tokens": 72349478.0, "reward": 2.1015625, "reward_std": 0.11612267047166824, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3077581524848938, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.0625, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1716.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 826.2421875, "completions/mean_terminated_length": 826.2421875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.05632841170948195, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1502523277802345, "kl": 0.080322265625, "learning_rate": 1.1228668941979524e-05, "loss": -0.007, "num_tokens": 72599316.0, "reward": 2.0888671875, "reward_std": 0.1286691129207611, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 786.7109375, "completions/mean_terminated_length": 785.7216186523438, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.05649910386617735, "frac_reward_zero_std": 0.5625, "grad_norm": 252.21843444670046, "kl": 36.3114013671875, "learning_rate": 1.126279863481229e-05, "loss": 1.4764, "num_tokens": 72844170.0, "reward": 2.1328125, "reward_std": 0.18079319596290588, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1588.0, "completions/mean_length": 650.3828125, "completions/mean_terminated_length": 639.3779296875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.05666979602287275, "frac_reward_zero_std": 0.5, "grad_norm": 0.25790116245724565, "kl": 0.081787109375, "learning_rate": 1.1296928327645051e-05, "loss": 0.0355, "num_tokens": 73049916.0, "reward": 2.1630859375, "reward_std": 0.22139763832092285, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38467901945114136, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05169277638196945, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 746.74609375, "completions/mean_terminated_length": 741.6431884765625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.05684048817956815, "frac_reward_zero_std": 0.625, "grad_norm": 0.17732369903215142, "kl": 0.08203125, "learning_rate": 1.1331058020477817e-05, "loss": 0.0236, "num_tokens": 73279771.0, "reward": 2.1611328125, "reward_std": 0.19531844556331635, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.3745708465576172, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 752.75, "completions/mean_terminated_length": 752.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.057011180336263546, "frac_reward_zero_std": 0.875, "grad_norm": 0.09643765767708176, "kl": 0.0811767578125, "learning_rate": 1.1365187713310582e-05, "loss": 0.0103, "num_tokens": 73510027.0, "reward": 2.19140625, "reward_std": 0.06116959825158119, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1844.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 703.1640625, "completions/mean_terminated_length": 703.1640625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.057181872492958946, "frac_reward_zero_std": 0.5625, "grad_norm": 0.22362770591068074, "kl": 0.10302734375, "learning_rate": 1.1399317406143346e-05, "loss": 0.0046, "num_tokens": 73730389.0, "reward": 2.11328125, "reward_std": 0.1727961003780365, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1500.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 746.03515625, "completions/mean_terminated_length": 746.03515625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.057352564649654346, "frac_reward_zero_std": 0.625, "grad_norm": 0.20047538855821925, "kl": 0.089111328125, "learning_rate": 1.143344709897611e-05, "loss": 0.0195, "num_tokens": 73974094.0, "reward": 2.078125, "reward_std": 0.13401086628437042, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 561.8203125, "completions/mean_terminated_length": 561.8203125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.057523256806349746, "frac_reward_zero_std": 0.4375, "grad_norm": 0.2877416336487494, "kl": 0.1016845703125, "learning_rate": 1.1467576791808874e-05, "loss": 0.0346, "num_tokens": 74155456.0, "reward": 2.1787109375, "reward_std": 0.21727436780929565, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.387910932302475, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1308.0, "completions/max_terminated_length": 1308.0, "completions/mean_length": 623.55078125, "completions/mean_terminated_length": 623.55078125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.057693948963045145, "frac_reward_zero_std": 0.4375, "grad_norm": 0.2821205591207904, "kl": 0.092041015625, "learning_rate": 1.1501706484641639e-05, "loss": 0.0016, "num_tokens": 74356541.0, "reward": 2.1904296875, "reward_std": 0.24023441970348358, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 700.77734375, "completions/mean_terminated_length": 698.8745727539062, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.057864641119740545, "frac_reward_zero_std": 0.8125, "grad_norm": 58058.71068897006, "kl": 2496.0701904296875, "learning_rate": 1.1535836177474405e-05, "loss": 99.7633, "num_tokens": 74578084.0, "reward": 2.05078125, "reward_std": 0.07756631821393967, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1713.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 762.4765625, "completions/mean_terminated_length": 762.4765625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.058035333276435945, "frac_reward_zero_std": 0.75, "grad_norm": 0.13746043708379002, "kl": 0.0992431640625, "learning_rate": 1.1569965870307167e-05, "loss": 0.0084, "num_tokens": 74815646.0, "reward": 2.0859375, "reward_std": 0.10520036518573761, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1419.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 720.99609375, "completions/mean_terminated_length": 720.1882934570312, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.058206025433131345, "frac_reward_zero_std": 0.3125, "grad_norm": 624.0307639834593, "kl": 48.560546875, "learning_rate": 1.1604095563139932e-05, "loss": 1.9369, "num_tokens": 75042989.0, "reward": 2.29296875, "reward_std": 0.2537429928779602, "rewards/accuracy_reward/mean": 0.29296875, "rewards/accuracy_reward/std": 0.45601576566696167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 697.12890625, "completions/mean_terminated_length": 697.12890625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.058376717589826745, "frac_reward_zero_std": 0.75, "grad_norm": 0.16808957548907957, "kl": 0.092041015625, "learning_rate": 1.1638225255972698e-05, "loss": 0.0009, "num_tokens": 75261822.0, "reward": 2.1015625, "reward_std": 0.08846627175807953, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1440.0, "completions/max_terminated_length": 1440.0, "completions/mean_length": 678.2734375, "completions/mean_terminated_length": 678.2734375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.058547409746522144, "frac_reward_zero_std": 0.625, "grad_norm": 0.2496919846645204, "kl": 0.099609375, "learning_rate": 1.1672354948805463e-05, "loss": 0.0124, "num_tokens": 75479412.0, "reward": 2.078125, "reward_std": 0.1370203047990799, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 724.1015625, "completions/mean_terminated_length": 724.1015625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.058718101903217544, "frac_reward_zero_std": 0.6875, "grad_norm": 0.21896908146206706, "kl": 0.0899658203125, "learning_rate": 1.1706484641638227e-05, "loss": 0.0148, "num_tokens": 75712094.0, "reward": 2.19921875, "reward_std": 0.13072282075881958, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1926.0, "completions/mean_length": 820.38671875, "completions/mean_terminated_length": 810.720458984375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.058888794059912944, "frac_reward_zero_std": 0.625, "grad_norm": 0.2202803112379985, "kl": 0.0899658203125, "learning_rate": 1.174061433447099e-05, "loss": 0.0156, "num_tokens": 75964545.0, "reward": 2.162109375, "reward_std": 0.1417873203754425, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 791.54296875, "completions/mean_terminated_length": 791.54296875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.059059486216608344, "frac_reward_zero_std": 0.6875, "grad_norm": 0.15408578349008956, "kl": 0.079345703125, "learning_rate": 1.1774744027303754e-05, "loss": 0.0141, "num_tokens": 76206092.0, "reward": 2.13671875, "reward_std": 0.13204212486743927, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.34422317147254944, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 722.6875, "completions/mean_terminated_length": 722.6875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.05923017837330374, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2495537442187399, "kl": 0.08544921875, "learning_rate": 1.180887372013652e-05, "loss": 0.0087, "num_tokens": 76432636.0, "reward": 2.203125, "reward_std": 0.1879279911518097, "rewards/accuracy_reward/mean": 0.21666666865348816, "rewards/accuracy_reward/std": 0.4128345549106598, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 746.88671875, "completions/mean_terminated_length": 720.9681396484375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.05940087052999914, "frac_reward_zero_std": 0.5, "grad_norm": 0.19810713425405563, "kl": 0.092529296875, "learning_rate": 1.1843003412969285e-05, "loss": 0.019, "num_tokens": 76667951.0, "reward": 2.1943359375, "reward_std": 0.21082806587219238, "rewards/accuracy_reward/mean": 0.22265625, "rewards/accuracy_reward/std": 0.41684433817863464, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.07120048254728317, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1411.0, "completions/max_terminated_length": 1411.0, "completions/mean_length": 672.94140625, "completions/mean_terminated_length": 672.94140625, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.05957156268669454, "frac_reward_zero_std": 0.625, "grad_norm": 0.19756594545819195, "kl": 0.0914306640625, "learning_rate": 1.1877133105802047e-05, "loss": -0.0025, "num_tokens": 76879296.0, "reward": 2.119140625, "reward_std": 0.16356796026229858, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1707.0, "completions/max_terminated_length": 1707.0, "completions/mean_length": 748.32421875, "completions/mean_terminated_length": 748.32421875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.05974225484338994, "frac_reward_zero_std": 0.6875, "grad_norm": 0.19606478609155906, "kl": 0.1016845703125, "learning_rate": 1.1911262798634813e-05, "loss": 0.0221, "num_tokens": 77115171.0, "reward": 2.1953125, "reward_std": 0.1271837055683136, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1414.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 698.12109375, "completions/mean_terminated_length": 698.12109375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.05991294700008535, "frac_reward_zero_std": 0.75, "grad_norm": 0.14265835927675888, "kl": 0.1004638671875, "learning_rate": 1.1945392491467578e-05, "loss": 0.0022, "num_tokens": 77339970.0, "reward": 2.15625, "reward_std": 0.11123998463153839, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3734568655490875, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 665.10546875, "completions/mean_terminated_length": 665.10546875, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.06008363915678075, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3218827838265877, "kl": 0.1307373046875, "learning_rate": 1.1979522184300342e-05, "loss": 0.0192, "num_tokens": 77549645.0, "reward": 2.1904296875, "reward_std": 0.22623935341835022, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1394.0, "completions/max_terminated_length": 1394.0, "completions/mean_length": 592.6328125, "completions/mean_terminated_length": 592.6328125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.06025433131347615, "frac_reward_zero_std": 0.75, "grad_norm": 0.16468757076796733, "kl": 0.10791015625, "learning_rate": 1.2013651877133108e-05, "loss": 0.007, "num_tokens": 77742047.0, "reward": 2.18359375, "reward_std": 0.08627147972583771, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.387910932302475, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1375.0, "completions/max_terminated_length": 1375.0, "completions/mean_length": 701.5625, "completions/mean_terminated_length": 699.3255615234375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.06042502347017155, "frac_reward_zero_std": 0.5625, "grad_norm": 0.21551709552736825, "kl": 0.1077880859375, "learning_rate": 1.2047781569965871e-05, "loss": 0.0109, "num_tokens": 77964063.0, "reward": 2.2216796875, "reward_std": 0.18251782655715942, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.41942715644836426, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1689.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 690.265625, "completions/mean_terminated_length": 690.265625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.06059571562686695, "frac_reward_zero_std": 0.6875, "grad_norm": 0.18634930157354926, "kl": 0.098388671875, "learning_rate": 1.2081911262798635e-05, "loss": 0.0078, "num_tokens": 78184387.0, "reward": 2.0380859375, "reward_std": 0.10966061055660248, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1517.0, "completions/mean_length": 602.9375, "completions/mean_terminated_length": 597.2706298828125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.06076640778356235, "frac_reward_zero_std": 0.6875, "grad_norm": 0.18449711456277187, "kl": 0.1131591796875, "learning_rate": 1.21160409556314e-05, "loss": 0.0226, "num_tokens": 78379891.0, "reward": 2.2158203125, "reward_std": 0.11361522972583771, "rewards/accuracy_reward/mean": 0.22265625, "rewards/accuracy_reward/std": 0.41684433817863464, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1216.0, "completions/max_terminated_length": 1216.0, "completions/mean_length": 637.03125, "completions/mean_terminated_length": 637.03125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.06093709994025775, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2220976236554561, "kl": 0.108642578125, "learning_rate": 1.2150170648464166e-05, "loss": 0.0028, "num_tokens": 78585083.0, "reward": 2.046875, "reward_std": 0.15088102221488953, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03814799711108208, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1694.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 616.28125, "completions/mean_terminated_length": 616.28125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.06110779209695315, "frac_reward_zero_std": 0.625, "grad_norm": 0.22640601632898946, "kl": 0.0948486328125, "learning_rate": 1.2184300341296928e-05, "loss": 0.0025, "num_tokens": 78780483.0, "reward": 2.1357421875, "reward_std": 0.13621266186237335, "rewards/accuracy_reward/mean": 0.15000000596046448, "rewards/accuracy_reward/std": 0.3578176498413086, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1309.0, "completions/max_terminated_length": 1309.0, "completions/mean_length": 598.6484375, "completions/mean_terminated_length": 598.6484375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.06127848425364855, "frac_reward_zero_std": 0.5625, "grad_norm": 0.21259679510413743, "kl": 0.1014404296875, "learning_rate": 1.2218430034129694e-05, "loss": 0.0071, "num_tokens": 78981113.0, "reward": 2.193359375, "reward_std": 0.13930702209472656, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40311288833618164, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1233.0, "completions/max_terminated_length": 1233.0, "completions/mean_length": 599.734375, "completions/mean_terminated_length": 599.98828125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.06144917641034395, "frac_reward_zero_std": 0.75, "grad_norm": 0.2120445168352927, "kl": 0.1212158203125, "learning_rate": 1.2252559726962459e-05, "loss": 0.005, "num_tokens": 79181989.0, "reward": 2.025390625, "reward_std": 0.10069897770881653, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1334.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 578.07421875, "completions/mean_terminated_length": 578.07421875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.06161986856703935, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2010045858563395, "kl": 0.1202392578125, "learning_rate": 1.2286689419795223e-05, "loss": 0.0006, "num_tokens": 79375592.0, "reward": 2.1220703125, "reward_std": 0.13136345148086548, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03488371521234512, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1026.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 500.9140625, "completions/mean_terminated_length": 500.9140625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.06179056072373475, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2315356574366394, "kl": 0.114990234375, "learning_rate": 1.2320819112627987e-05, "loss": 0.0162, "num_tokens": 79545490.0, "reward": 2.2060546875, "reward_std": 0.17317432165145874, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1592.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 646.65234375, "completions/mean_terminated_length": 646.65234375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.06196125288043015, "frac_reward_zero_std": 0.625, "grad_norm": 0.22421556481998886, "kl": 0.11767578125, "learning_rate": 1.2354948805460752e-05, "loss": 0.003, "num_tokens": 79752025.0, "reward": 2.0869140625, "reward_std": 0.15719670057296753, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 595.03125, "completions/mean_terminated_length": 594.3843383789062, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.06213194503712555, "frac_reward_zero_std": 0.8125, "grad_norm": 5.990694824677285, "kl": 0.6802978515625, "learning_rate": 1.2389078498293516e-05, "loss": 0.0384, "num_tokens": 79947137.0, "reward": 2.0546875, "reward_std": 0.05259781330823898, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1086.0, "completions/max_terminated_length": 1086.0, "completions/mean_length": 573.421875, "completions/mean_terminated_length": 573.421875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.062302637193820946, "frac_reward_zero_std": 0.8125, "grad_norm": 0.17504377939237323, "kl": 0.125244140625, "learning_rate": 1.2423208191126281e-05, "loss": 0.0058, "num_tokens": 80137085.0, "reward": 1.9853515625, "reward_std": 0.05859375, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 1212.0, "completions/max_terminated_length": 1212.0, "completions/mean_length": 627.37109375, "completions/mean_terminated_length": 624.5494384765625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.062473329350516346, "frac_reward_zero_std": 0.1875, "grad_norm": 0.6607895909652702, "kl": 0.140869140625, "learning_rate": 1.2457337883959047e-05, "loss": 0.0228, "num_tokens": 80336524.0, "reward": 2.0224609375, "reward_std": 0.4142237901687622, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.8828125, "rewards/format_reward/std": 0.3222736418247223, "rewards/tag_count_reward/mean": 0.9677734375, "rewards/tag_count_reward/std": 0.09490203112363815, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1342.0, "completions/max_terminated_length": 1342.0, "completions/mean_length": 719.625, "completions/mean_terminated_length": 719.0314331054688, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.06264402150721174, "frac_reward_zero_std": 0.75, "grad_norm": 0.21336240183338442, "kl": 0.1043701171875, "learning_rate": 1.2491467576791809e-05, "loss": 0.0082, "num_tokens": 80563276.0, "reward": 2.078125, "reward_std": 0.09859732538461685, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1265.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 605.19921875, "completions/mean_terminated_length": 605.19921875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.06281471366390715, "frac_reward_zero_std": 0.6875, "grad_norm": 0.1844636930981008, "kl": 0.1182861328125, "learning_rate": 1.2525597269624574e-05, "loss": 0.0053, "num_tokens": 80757295.0, "reward": 2.1953125, "reward_std": 0.12159234285354614, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 1240.0, "completions/max_terminated_length": 1240.0, "completions/mean_length": 623.140625, "completions/mean_terminated_length": 622.6220703125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.06298540582060254, "frac_reward_zero_std": 0.6875, "grad_norm": 5.1504928196795925, "kl": 0.609375, "learning_rate": 1.255972696245734e-05, "loss": 0.0275, "num_tokens": 80953651.0, "reward": 2.125, "reward_std": 0.10189647972583771, "rewards/accuracy_reward/mean": 0.13333334028720856, "rewards/accuracy_reward/std": 0.34064507484436035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.90625, "completions/max_length": 1689.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 658.921875, "completions/mean_terminated_length": 657.4743041992188, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.06315609797729795, "frac_reward_zero_std": 0.75, "grad_norm": 0.2876377972663552, "kl": 0.14599609375, "learning_rate": 1.2593856655290104e-05, "loss": 0.0144, "num_tokens": 81172815.0, "reward": 2.1240234375, "reward_std": 0.10415078699588776, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 1286.0, "completions/max_terminated_length": 1286.0, "completions/mean_length": 646.71484375, "completions/mean_terminated_length": 643.6299438476562, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.06332679013399334, "frac_reward_zero_std": 0.625, "grad_norm": 0.2080275936019025, "kl": 0.145751953125, "learning_rate": 1.2627986348122867e-05, "loss": 0.0054, "num_tokens": 81378310.0, "reward": 2.1904296875, "reward_std": 0.16486260294914246, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 675.69140625, "completions/mean_terminated_length": 671.909423828125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.06349748229068874, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3551979481708926, "kl": 0.1380615234375, "learning_rate": 1.2662116040955633e-05, "loss": 0.0101, "num_tokens": 81598679.0, "reward": 2.15625, "reward_std": 0.18056906759738922, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 1418.0, "completions/max_terminated_length": 1418.0, "completions/mean_length": 698.68359375, "completions/mean_terminated_length": 690.7848510742188, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.06366817444738414, "frac_reward_zero_std": 0.875, "grad_norm": 0.400552008904712, "kl": 0.1875, "learning_rate": 1.2696245733788397e-05, "loss": 0.0137, "num_tokens": 81816070.0, "reward": 2.08984375, "reward_std": 0.046875, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.6875, "completions/max_length": 1253.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 632.17578125, "completions/mean_terminated_length": 620.3414306640625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.06383886660407954, "frac_reward_zero_std": 0.625, "grad_norm": 0.2557344158616931, "kl": 0.257080078125, "learning_rate": 1.2730375426621162e-05, "loss": 0.0407, "num_tokens": 82019955.0, "reward": 2.2109375, "reward_std": 0.15900647640228271, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.9375, "completions/max_length": 1360.0, "completions/max_terminated_length": 1360.0, "completions/mean_length": 715.765625, "completions/mean_terminated_length": 712.0472412109375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.06400955876077494, "frac_reward_zero_std": 0.625, "grad_norm": 0.21408422002292987, "kl": 0.147216796875, "learning_rate": 1.2764505119453924e-05, "loss": 0.0196, "num_tokens": 82242151.0, "reward": 2.09765625, "reward_std": 0.12333697080612183, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29743078351020813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.875, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 764.98046875, "completions/mean_terminated_length": 758.0635375976562, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.06418025091747034, "frac_reward_zero_std": 0.75, "grad_norm": 0.305373969280133, "kl": 0.16064453125, "learning_rate": 1.279863481228669e-05, "loss": 0.0058, "num_tokens": 82482514.0, "reward": 2.0703125, "reward_std": 0.10024453699588776, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 629.50390625, "completions/mean_terminated_length": 628.7686767578125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.06435094307416574, "frac_reward_zero_std": 0.8125, "grad_norm": 0.14754834860699617, "kl": 0.1160888671875, "learning_rate": 1.2832764505119455e-05, "loss": 0.0046, "num_tokens": 82682307.0, "reward": 2.0546875, "reward_std": 0.06689241528511047, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22781464457511902, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1153.0, "completions/max_terminated_length": 1153.0, "completions/mean_length": 607.3046875, "completions/mean_terminated_length": 605.7568969726562, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.06452163523086114, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2180477826196865, "kl": 0.11962890625, "learning_rate": 1.286689419795222e-05, "loss": 0.0093, "num_tokens": 82881297.0, "reward": 2.060546875, "reward_std": 0.16389039158821106, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2561737895011902, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1413.0, "completions/max_terminated_length": 1413.0, "completions/mean_length": 778.75, "completions/mean_terminated_length": 777.10986328125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.06469232738755654, "frac_reward_zero_std": 0.625, "grad_norm": 2.9592410768580053, "kl": 0.1177978515625, "learning_rate": 1.2901023890784984e-05, "loss": -0.0018, "num_tokens": 83116817.0, "reward": 2.0556640625, "reward_std": 0.14313378930091858, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1301.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 589.515625, "completions/mean_terminated_length": 588.933349609375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.06486301954425194, "frac_reward_zero_std": 0.5, "grad_norm": 0.3466599023551739, "kl": 0.12841796875, "learning_rate": 1.2935153583617748e-05, "loss": 0.012, "num_tokens": 83309685.0, "reward": 2.1953125, "reward_std": 0.208757221698761, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -7.0, "completions/max_length": 1292.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 614.1171875, "completions/mean_terminated_length": 614.1171875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.06503371170094734, "frac_reward_zero_std": 0.5, "grad_norm": 0.28269929002280114, "kl": 0.118896484375, "learning_rate": 1.2969283276450513e-05, "loss": 0.0208, "num_tokens": 83510931.0, "reward": 2.203125, "reward_std": 0.19582012295722961, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40311288833618164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1361.0, "completions/max_terminated_length": 1361.0, "completions/mean_length": 698.3984375, "completions/mean_terminated_length": 696.6431884765625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.06520440385764274, "frac_reward_zero_std": 0.375, "grad_norm": 1.0943515524051959, "kl": 0.1148681640625, "learning_rate": 1.3003412969283277e-05, "loss": 0.0172, "num_tokens": 83732921.0, "reward": 2.1298828125, "reward_std": 0.2504074275493622, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1386.0, "completions/max_terminated_length": 1386.0, "completions/mean_length": 745.015625, "completions/mean_terminated_length": 742.98828125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.06537509601433814, "frac_reward_zero_std": 0.625, "grad_norm": 0.541776513013078, "kl": 0.112548828125, "learning_rate": 1.3037542662116043e-05, "loss": -0.0064, "num_tokens": 83965421.0, "reward": 2.1025390625, "reward_std": 0.18384021520614624, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03488371521234512, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1265.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 700.87890625, "completions/mean_terminated_length": 700.0157470703125, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.06554578817103354, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2558268032046653, "kl": 0.130615234375, "learning_rate": 1.3071672354948805e-05, "loss": 0.011, "num_tokens": 84184830.0, "reward": 2.01953125, "reward_std": 0.11808021366596222, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.40625, "completions/max_length": 1563.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 808.24609375, "completions/mean_terminated_length": 650.0693359375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.06571648032772893, "frac_reward_zero_std": 0.25, "grad_norm": 13.48473127210206, "kl": 2.05078125, "learning_rate": 1.310580204778157e-05, "loss": 0.0768, "num_tokens": 84439357.0, "reward": 2.08984375, "reward_std": 0.22624936699867249, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.10266068577766418, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 1469.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 1087.3359375, "completions/mean_terminated_length": 847.7999877929688, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.06588717248442434, "frac_reward_zero_std": 0.1875, "grad_norm": 39.05861460067673, "kl": 5.03125, "learning_rate": 1.3139931740614336e-05, "loss": 0.1902, "num_tokens": 84759475.0, "reward": 2.0244140625, "reward_std": 0.2954064607620239, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9189453125, "rewards/tag_count_reward/std": 0.26201045513153076, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1415.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 967.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 694.0, "completions/min_terminated_length": 0.0, "epoch": 0.06605786464111973, "frac_reward_zero_std": 0.0625, "grad_norm": 13.538241435169844, "kl": 2.87890625, "learning_rate": 1.3174061433447101e-05, "loss": 0.1154, "num_tokens": 85050707.0, "reward": 1.955078125, "reward_std": 0.37630054354667664, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.876953125, "rewards/tag_count_reward/std": 0.30356717109680176, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 830.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 469.0, "completions/min_terminated_length": 0.0, "epoch": 0.06622855679781514, "frac_reward_zero_std": 0.0625, "grad_norm": 3.6174239464801916, "kl": 2.28515625, "learning_rate": 1.3208191126279865e-05, "loss": 0.0916, "num_tokens": 85303475.0, "reward": 2.0625, "reward_std": 0.220173180103302, "rewards/accuracy_reward/mean": 0.13333334028720856, "rewards/accuracy_reward/std": 0.34064507484436035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.19045868515968323, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1177.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 979.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 710.0, "completions/min_terminated_length": 0.0, "epoch": 0.06639924895451053, "frac_reward_zero_std": 0.75, "grad_norm": 1.458100685606578, "kl": 2.1484375, "learning_rate": 1.3242320819112629e-05, "loss": 0.0859, "num_tokens": 85595667.0, "reward": 2.041015625, "reward_std": 0.08263835310935974, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1226.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 895.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 672.0, "completions/min_terminated_length": 0.0, "epoch": 0.06656994111120594, "frac_reward_zero_std": 0.625, "grad_norm": 1.1049285221724028, "kl": 2.29296875, "learning_rate": 1.3276450511945394e-05, "loss": 0.0915, "num_tokens": 85862291.0, "reward": 2.18359375, "reward_std": 0.17006689310073853, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.387910932302475, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 924.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 711.0, "completions/min_terminated_length": 0.0, "epoch": 0.06674063326790133, "frac_reward_zero_std": 0.625, "grad_norm": 4.901915242135725, "kl": 2.34765625, "learning_rate": 1.3310580204778158e-05, "loss": 0.094, "num_tokens": 86143891.0, "reward": 2.078125, "reward_std": 0.13644562661647797, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1223.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1026.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 796.0, "completions/min_terminated_length": 0.0, "epoch": 0.06691132542459674, "frac_reward_zero_std": 0.625, "grad_norm": 2.6973000948755588, "kl": 2.23046875, "learning_rate": 1.3344709897610923e-05, "loss": 0.0893, "num_tokens": 86443427.0, "reward": 2.056640625, "reward_std": 0.14810487627983093, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1627.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1055.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 699.0, "completions/min_terminated_length": 0.0, "epoch": 0.06708201758129213, "frac_reward_zero_std": 0.375, "grad_norm": 0.9521194985975299, "kl": 2.08984375, "learning_rate": 1.3378839590443686e-05, "loss": 0.0836, "num_tokens": 86750739.0, "reward": 2.0888671875, "reward_std": 0.2647116780281067, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.05117155611515045, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1595.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1146.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 882.0, "completions/min_terminated_length": 0.0, "epoch": 0.06725270973798754, "frac_reward_zero_std": 0.625, "grad_norm": 2.0977470245271417, "kl": 2.04296875, "learning_rate": 1.3412969283276451e-05, "loss": 0.0818, "num_tokens": 87088019.0, "reward": 2.087890625, "reward_std": 0.15595689415931702, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29743078351020813, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1074.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 750.0, "completions/min_terminated_length": 0.0, "epoch": 0.06742340189468293, "frac_reward_zero_std": 0.8125, "grad_norm": 0.6432695263420216, "kl": 2.1484375, "learning_rate": 1.3447098976109216e-05, "loss": 0.0858, "num_tokens": 87405347.0, "reward": 2.0302734375, "reward_std": 0.06743110716342926, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1190.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 939.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 651.0, "completions/min_terminated_length": 0.0, "epoch": 0.06759409405137834, "frac_reward_zero_std": 0.0, "grad_norm": 3.6651377893966197, "kl": 2.25390625, "learning_rate": 1.3481228668941982e-05, "loss": 0.09, "num_tokens": 87680691.0, "reward": 1.662109375, "reward_std": 0.6332250237464905, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.57421875, "rewards/format_reward/std": 0.49542948603630066, "rewards/tag_count_reward/mean": 0.916015625, "rewards/tag_count_reward/std": 0.1183105930685997, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1280.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1018.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 795.0, "completions/min_terminated_length": 0.0, "epoch": 0.06776478620807373, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3313225375741593, "kl": 2.01171875, "learning_rate": 1.3515358361774744e-05, "loss": 0.0804, "num_tokens": 87977827.0, "reward": 2.19140625, "reward_std": 0.1708770990371704, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1523.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1088.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 637.0, "completions/min_terminated_length": 0.0, "epoch": 0.06793547836476914, "frac_reward_zero_std": 0.8125, "grad_norm": 1.8500737107434733, "kl": 2.1875, "learning_rate": 1.354948805460751e-05, "loss": 0.0874, "num_tokens": 88303027.0, "reward": 2.09765625, "reward_std": 0.06327171623706818, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29743078351020813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1261.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1132.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 962.0, "completions/min_terminated_length": 0.0, "epoch": 0.06810617052146453, "frac_reward_zero_std": 0.75, "grad_norm": 0.36533667813923226, "kl": 1.841796875, "learning_rate": 1.3583617747440275e-05, "loss": 0.0737, "num_tokens": 88627651.0, "reward": 2.1474609375, "reward_std": 0.09499544650316238, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.3600577116012573, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1719.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1154.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 655.0, "completions/min_terminated_length": 0.0, "epoch": 0.06827686267815994, "frac_reward_zero_std": 0.75, "grad_norm": 1.8693960021895002, "kl": 1.818359375, "learning_rate": 1.3617747440273039e-05, "loss": 0.0728, "num_tokens": 88961715.0, "reward": 2.009765625, "reward_std": 0.08263835310935974, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1305.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1066.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 696.0, "completions/min_terminated_length": 0.0, "epoch": 0.06844755483485535, "frac_reward_zero_std": 0.9375, "grad_norm": 1.4477693119952333, "kl": 1.875, "learning_rate": 1.3651877133105804e-05, "loss": 0.075, "num_tokens": 89275267.0, "reward": 2.01953125, "reward_std": 0.029919598251581192, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1064.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 663.0, "completions/min_terminated_length": 0.0, "epoch": 0.06861824699155074, "frac_reward_zero_std": 0.6875, "grad_norm": 1.241043356161663, "kl": 1.79296875, "learning_rate": 1.3686006825938566e-05, "loss": 0.0717, "num_tokens": 89590371.0, "reward": 2.1279296875, "reward_std": 0.11977578699588776, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1052.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 551.0, "completions/min_terminated_length": 0.0, "epoch": 0.06878893914824614, "frac_reward_zero_std": 0.6875, "grad_norm": 2.6564296742436975, "kl": 1.705078125, "learning_rate": 1.3720136518771332e-05, "loss": 0.0683, "num_tokens": 89903619.0, "reward": 2.1513671875, "reward_std": 0.13617250323295593, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1169.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 931.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 659.0, "completions/min_terminated_length": 0.0, "epoch": 0.06895963130494154, "frac_reward_zero_std": 0.4375, "grad_norm": 1.1996524510280422, "kl": 1.625, "learning_rate": 1.3754266211604097e-05, "loss": 0.065, "num_tokens": 90187987.0, "reward": 2.212890625, "reward_std": 0.23451192677021027, "rewards/accuracy_reward/mean": 0.22265625, "rewards/accuracy_reward/std": 0.41684433817863464, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1150.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 703.0, "completions/min_terminated_length": 0.0, "epoch": 0.06913032346163694, "frac_reward_zero_std": 0.6875, "grad_norm": 1.5454549309852843, "kl": 1.630859375, "learning_rate": 1.3788395904436863e-05, "loss": 0.0653, "num_tokens": 90525923.0, "reward": 2.15234375, "reward_std": 0.14894512295722961, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.3600577116012573, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1028.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 817.0, "completions/min_terminated_length": 0.0, "epoch": 0.06930101561833234, "frac_reward_zero_std": 0.6875, "grad_norm": 2.9755878790862225, "kl": 1.607421875, "learning_rate": 1.3822525597269625e-05, "loss": 0.0643, "num_tokens": 90829843.0, "reward": 2.078125, "reward_std": 0.13367824256420135, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1192.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 973.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 768.0, "completions/min_terminated_length": 0.0, "epoch": 0.06947170777502774, "frac_reward_zero_std": 0.875, "grad_norm": 2.5881776470923583, "kl": 1.169921875, "learning_rate": 1.385665529010239e-05, "loss": 0.0468, "num_tokens": 91116467.0, "reward": 2.09765625, "reward_std": 0.05336953327059746, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29743078351020813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1127.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 991.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 788.0, "completions/min_terminated_length": 0.0, "epoch": 0.06964239993172314, "frac_reward_zero_std": 0.375, "grad_norm": 13706.626023778286, "kl": 845.5, "learning_rate": 1.3890784982935156e-05, "loss": 33.8014, "num_tokens": 91410947.0, "reward": 2.2294921875, "reward_std": 0.22492024302482605, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42443734407424927, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1532.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1231.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 855.0, "completions/min_terminated_length": 0.0, "epoch": 0.06981309208841854, "frac_reward_zero_std": 0.625, "grad_norm": 1896.6961145972086, "kl": 97.5, "learning_rate": 1.392491467576792e-05, "loss": 3.9101, "num_tokens": 91765555.0, "reward": 2.11328125, "reward_std": 0.14006631076335907, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1184.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 970.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 719.0, "completions/min_terminated_length": 0.0, "epoch": 0.06998378424511394, "frac_reward_zero_std": 0.8125, "grad_norm": 3.800067114053644, "kl": 1.53125, "learning_rate": 1.3959044368600683e-05, "loss": 0.0613, "num_tokens": 92053843.0, "reward": 2.02734375, "reward_std": 0.06116959825158119, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1277.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1022.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 773.0, "completions/min_terminated_length": 0.0, "epoch": 0.07015447640180934, "frac_reward_zero_std": 0.5625, "grad_norm": 1.9416662612125477, "kl": 1.41796875, "learning_rate": 1.3993174061433447e-05, "loss": 0.0567, "num_tokens": 92356531.0, "reward": 2.1396484375, "reward_std": 0.16995467245578766, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1611.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1164.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 821.0, "completions/min_terminated_length": 0.0, "epoch": 0.07032516855850474, "frac_reward_zero_std": 0.4375, "grad_norm": 0.694708230976691, "kl": 1.201171875, "learning_rate": 1.4027303754266213e-05, "loss": 0.048, "num_tokens": 92704627.0, "reward": 2.2275390625, "reward_std": 0.24356648325920105, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42443734407424927, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1258.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1001.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 716.0, "completions/min_terminated_length": 0.0, "epoch": 0.07049586071520014, "frac_reward_zero_std": 0.8125, "grad_norm": 33.53570438221579, "kl": 2.10546875, "learning_rate": 1.4061433447098978e-05, "loss": 0.0842, "num_tokens": 93003827.0, "reward": 2.01953125, "reward_std": 0.058320626616477966, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1476.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1044.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 700.0, "completions/min_terminated_length": 0.0, "epoch": 0.07066655287189554, "frac_reward_zero_std": 0.8125, "grad_norm": 3.9033782343076147, "kl": 0.8759765625, "learning_rate": 1.4095563139931743e-05, "loss": 0.035, "num_tokens": 93309299.0, "reward": 2.14453125, "reward_std": 0.0813203826546669, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1260.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 987.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 717.0, "completions/min_terminated_length": 0.0, "epoch": 0.07083724502859094, "frac_reward_zero_std": 0.6875, "grad_norm": 1.8008353514169662, "kl": 0.7041015625, "learning_rate": 1.4129692832764506e-05, "loss": 0.0282, "num_tokens": 93598675.0, "reward": 2.1669921875, "reward_std": 0.1385534107685089, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1086.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 895.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 642.0, "completions/min_terminated_length": 0.0, "epoch": 0.07100793718528634, "frac_reward_zero_std": 0.75, "grad_norm": 651.1733372201548, "kl": 37.28125, "learning_rate": 1.4163822525597271e-05, "loss": 1.4892, "num_tokens": 93869923.0, "reward": 2.0966796875, "reward_std": 0.08203125, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1549.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 972.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 528.0, "completions/min_terminated_length": 0.0, "epoch": 0.07117862934198174, "frac_reward_zero_std": 0.6875, "grad_norm": 507.00338839138226, "kl": 31.875, "learning_rate": 1.4197952218430035e-05, "loss": 1.2775, "num_tokens": 94160355.0, "reward": 2.171875, "reward_std": 0.13258779048919678, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1432.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1044.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 751.0, "completions/min_terminated_length": 0.0, "epoch": 0.07134932149867713, "frac_reward_zero_std": 0.625, "grad_norm": 12.103119815480841, "kl": 1.73046875, "learning_rate": 1.42320819112628e-05, "loss": 0.0692, "num_tokens": 94466515.0, "reward": 2.1083984375, "reward_std": 0.16706499457359314, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1110.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 937.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 811.0, "completions/min_terminated_length": 0.0, "epoch": 0.07152001365537254, "frac_reward_zero_std": 0.75, "grad_norm": 3.1137199621378464, "kl": 1.298828125, "learning_rate": 1.4266211604095564e-05, "loss": 0.052, "num_tokens": 94744227.0, "reward": 2.06640625, "reward_std": 0.09452171623706818, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1309.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 871.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 591.0, "completions/min_terminated_length": 0.0, "epoch": 0.07169070581206793, "frac_reward_zero_std": 0.625, "grad_norm": 0.9868612781426003, "kl": 1.56640625, "learning_rate": 1.4300341296928328e-05, "loss": 0.0626, "num_tokens": 95004451.0, "reward": 2.1083984375, "reward_std": 0.12362252175807953, "rewards/accuracy_reward/mean": 0.12083332985639572, "rewards/accuracy_reward/std": 0.326614648103714, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 841.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 686.0, "completions/min_terminated_length": 0.0, "epoch": 0.07186139796876334, "frac_reward_zero_std": 0.5, "grad_norm": 1.6671762202092804, "kl": 1.609375, "learning_rate": 1.4334470989761093e-05, "loss": 0.0643, "num_tokens": 95261619.0, "reward": 2.1865234375, "reward_std": 0.2082703709602356, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1113.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 566.0, "completions/min_terminated_length": 0.0, "epoch": 0.07203209012545873, "frac_reward_zero_std": 0.4375, "grad_norm": 2.5484439499129996, "kl": 1.314453125, "learning_rate": 1.4368600682593859e-05, "loss": 0.0526, "num_tokens": 95587443.0, "reward": 2.0966796875, "reward_std": 0.22745294868946075, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31272050738334656, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.08950243145227432, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1137.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 860.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 519.0, "completions/min_terminated_length": 0.0, "epoch": 0.07220278228215414, "frac_reward_zero_std": 0.5625, "grad_norm": 2.1727762796493617, "kl": 1.208984375, "learning_rate": 1.4402730375426624e-05, "loss": 0.0484, "num_tokens": 95850995.0, "reward": 2.1181640625, "reward_std": 0.14860346913337708, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1294.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 973.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 552.0, "completions/min_terminated_length": 0.0, "epoch": 0.07237347443884953, "frac_reward_zero_std": 0.8125, "grad_norm": 0.8520390313187038, "kl": 0.755859375, "learning_rate": 1.4436860068259386e-05, "loss": 0.0302, "num_tokens": 96144675.0, "reward": 2.13671875, "reward_std": 0.07756631821393967, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.34422317147254944, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1475.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1075.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 646.0, "completions/min_terminated_length": 0.0, "epoch": 0.07254416659554494, "frac_reward_zero_std": 0.6875, "grad_norm": 49.10511517918696, "kl": 3.59765625, "learning_rate": 1.4470989761092152e-05, "loss": 0.1442, "num_tokens": 96470131.0, "reward": 2.11328125, "reward_std": 0.11234626173973083, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1458.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1133.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 849.0, "completions/min_terminated_length": 0.0, "epoch": 0.07271485875224033, "frac_reward_zero_std": 0.5625, "grad_norm": 963.5280187879081, "kl": 65.21875, "learning_rate": 1.4505119453924915e-05, "loss": 2.6071, "num_tokens": 96802915.0, "reward": 2.076171875, "reward_std": 0.17386090755462646, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1218.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 937.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 791.0, "completions/min_terminated_length": 0.0, "epoch": 0.07288555090893574, "frac_reward_zero_std": 0.75, "grad_norm": 191.20321948097725, "kl": 13.4375, "learning_rate": 1.4539249146757681e-05, "loss": 0.5372, "num_tokens": 97088147.0, "reward": 2.03125, "reward_std": 0.08351518213748932, "rewards/accuracy_reward/mean": 0.03333333507180214, "rewards/accuracy_reward/std": 0.17988063395023346, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1376.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 972.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 777.0, "completions/min_terminated_length": 0.0, "epoch": 0.07305624306563113, "frac_reward_zero_std": 0.8125, "grad_norm": 9.497825270847448, "kl": 1.0986328125, "learning_rate": 1.4573378839590445e-05, "loss": 0.0439, "num_tokens": 97372323.0, "reward": 2.0380859375, "reward_std": 0.08401669561862946, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1428.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1143.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 760.0, "completions/min_terminated_length": 0.0, "epoch": 0.07322693522232654, "frac_reward_zero_std": 0.75, "grad_norm": 2.9031603708064853, "kl": 0.8095703125, "learning_rate": 1.4607508532423209e-05, "loss": 0.0324, "num_tokens": 97707635.0, "reward": 2.0458984375, "reward_std": 0.09237252175807953, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1260.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1104.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 842.0, "completions/min_terminated_length": 0.0, "epoch": 0.07339762737902193, "frac_reward_zero_std": 0.5625, "grad_norm": 2.7848255408215032, "kl": 1.12109375, "learning_rate": 1.4641638225255974e-05, "loss": 0.0448, "num_tokens": 98032835.0, "reward": 2.0966796875, "reward_std": 0.13902148604393005, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1693.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1129.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 864.0, "completions/min_terminated_length": 0.0, "epoch": 0.07356831953571734, "frac_reward_zero_std": 0.75, "grad_norm": 2.471762573665547, "kl": 1.203125, "learning_rate": 1.467576791808874e-05, "loss": 0.0481, "num_tokens": 98359747.0, "reward": 2.0859375, "reward_std": 0.10684756934642792, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1350.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1098.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 715.0, "completions/min_terminated_length": 0.0, "epoch": 0.07373901169241273, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7291503724733597, "kl": 1.330078125, "learning_rate": 1.4709897610921502e-05, "loss": 0.0532, "num_tokens": 98684051.0, "reward": 2.1787109375, "reward_std": 0.13814318180084229, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.387910932302475, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1537.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1127.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 781.0, "completions/min_terminated_length": 0.0, "epoch": 0.07390970384910814, "frac_reward_zero_std": 0.5, "grad_norm": 1.270555797841962, "kl": 1.1328125, "learning_rate": 1.4744027303754267e-05, "loss": 0.0452, "num_tokens": 99013331.0, "reward": 2.171875, "reward_std": 0.1986129879951477, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1518.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1099.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 865.0, "completions/min_terminated_length": 0.0, "epoch": 0.07408039600580353, "frac_reward_zero_std": 0.6875, "grad_norm": 7.142415827463973, "kl": 0.8701171875, "learning_rate": 1.4778156996587032e-05, "loss": 0.0348, "num_tokens": 99335875.0, "reward": 2.0927734375, "reward_std": 0.12779080867767334, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29743078351020813, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1441.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1201.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 847.0, "completions/min_terminated_length": 0.0, "epoch": 0.07425108816249894, "frac_reward_zero_std": 0.5625, "grad_norm": 10.903817351698583, "kl": 1.1328125, "learning_rate": 1.4812286689419796e-05, "loss": 0.0452, "num_tokens": 99680451.0, "reward": 2.173828125, "reward_std": 0.1806240826845169, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.387910932302475, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1769.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1315.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1046.0, "completions/min_terminated_length": 0.0, "epoch": 0.07442178031919433, "frac_reward_zero_std": 0.875, "grad_norm": 6.05399875689036, "kl": 0.9482421875, "learning_rate": 1.4846416382252562e-05, "loss": 0.0378, "num_tokens": 100060163.0, "reward": 2.0537109375, "reward_std": 0.03515625, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1660.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1224.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 854.0, "completions/min_terminated_length": 0.0, "epoch": 0.07459247247588974, "frac_reward_zero_std": 0.6875, "grad_norm": 1.9682473952098085, "kl": 0.7314453125, "learning_rate": 1.4880546075085325e-05, "loss": 0.0293, "num_tokens": 100418227.0, "reward": 2.0576171875, "reward_std": 0.12724322080612183, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1482.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 1068.0, "completions/min_terminated_length": 0.0, "epoch": 0.07476316463258513, "frac_reward_zero_std": 0.5625, "grad_norm": 1.4016548458325775, "kl": 1.068359375, "learning_rate": 1.491467576791809e-05, "loss": 0.0427, "num_tokens": 100849523.0, "reward": 2.18359375, "reward_std": 0.19074104726314545, "rewards/accuracy_reward/mean": 0.19921875, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03814799711108208, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1983.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1379.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 822.0, "completions/min_terminated_length": 0.0, "epoch": 0.07493385678928054, "frac_reward_zero_std": 0.625, "grad_norm": 3.77879966471427, "kl": 1.1484375, "learning_rate": 1.4948805460750855e-05, "loss": 0.0459, "num_tokens": 101246355.0, "reward": 2.173828125, "reward_std": 0.1627182811498642, "rewards/accuracy_reward/mean": 0.18359375, "rewards/accuracy_reward/std": 0.387910932302475, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1528.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1161.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 877.0, "completions/min_terminated_length": 0.0, "epoch": 0.07510454894597593, "frac_reward_zero_std": 0.75, "grad_norm": 2.87306985771049, "kl": 1.08203125, "learning_rate": 1.498293515358362e-05, "loss": 0.0434, "num_tokens": 101586547.0, "reward": 2.1083984375, "reward_std": 0.11075381934642792, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1534.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1033.0, "completions/min_terminated_length": 0.0, "epoch": 0.07527524110267134, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6063722968827168, "kl": 1.048828125, "learning_rate": 1.5017064846416382e-05, "loss": 0.0419, "num_tokens": 102019219.0, "reward": 2.09765625, "reward_std": 0.15992596745491028, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1446.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 939.0, "completions/min_terminated_length": 0.0, "epoch": 0.07544593325936673, "frac_reward_zero_std": 0.375, "grad_norm": 3.3012428470351924, "kl": 1.0244140625, "learning_rate": 1.5051194539249148e-05, "loss": 0.041, "num_tokens": 102428099.0, "reward": 2.0859375, "reward_std": 0.26219508051872253, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06902241706848145, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1542.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1118.0, "completions/min_terminated_length": 0.0, "epoch": 0.07561662541606214, "frac_reward_zero_std": 0.4375, "grad_norm": 4.770482135747039, "kl": 1.125, "learning_rate": 1.5085324232081913e-05, "loss": 0.0451, "num_tokens": 102864003.0, "reward": 2.0517578125, "reward_std": 0.2103152722120285, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31272050738334656, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21178513765335083, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.05079597979784012, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1324.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 852.0, "completions/min_terminated_length": 0.0, "epoch": 0.07578731757275753, "frac_reward_zero_std": 0.4375, "grad_norm": 0.9676582714267524, "kl": 1.240234375, "learning_rate": 1.5119453924914677e-05, "loss": 0.0496, "num_tokens": 103246371.0, "reward": 2.0732421875, "reward_std": 0.27525758743286133, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.3600577116012573, "rewards/format_reward/mean": 0.93359375, "rewards/format_reward/std": 0.24947863817214966, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.05928463488817215, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1280.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 948.0, "completions/min_terminated_length": 0.0, "epoch": 0.07595800972945294, "frac_reward_zero_std": 0.75, "grad_norm": 1.481586198603652, "kl": 1.34765625, "learning_rate": 1.515358361774744e-05, "loss": 0.0539, "num_tokens": 103615459.0, "reward": 2.01953125, "reward_std": 0.10112476348876953, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.05805254727602005, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1412.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 714.0, "completions/min_terminated_length": 0.0, "epoch": 0.07612870188614833, "frac_reward_zero_std": 0.0, "grad_norm": 1.4595512756262428, "kl": 1.40625, "learning_rate": 1.5187713310580206e-05, "loss": 0.0562, "num_tokens": 104019843.0, "reward": 1.3330078125, "reward_std": 0.5535579919815063, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22781464457511902, "rewards/format_reward/mean": 0.41796875, "rewards/format_reward/std": 0.49419113993644714, "rewards/tag_count_reward/mean": 0.8603515625, "rewards/tag_count_reward/std": 0.15107548236846924, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.07629939404284374, "frac_reward_zero_std": 0.0, "grad_norm": 6.96950376331257, "kl": 0.9140625, "learning_rate": 1.522184300341297e-05, "loss": 0.0365, "num_tokens": 104586275.0, "reward": 0.4521484375, "reward_std": 0.21957182884216309, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31272050738334656, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3427734375, "rewards/tag_count_reward/std": 0.17555660009384155, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.07647008619953913, "frac_reward_zero_std": 0.125, "grad_norm": 4.178536022143903, "kl": 0.650390625, "learning_rate": 1.5255972696245735e-05, "loss": 0.026, "num_tokens": 105151155.0, "reward": 0.111328125, "reward_std": 0.14097259938716888, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.044921875, "rewards/tag_count_reward/std": 0.11040741950273514, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.07664077835623453, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8272619325720302, "kl": 1.052734375, "learning_rate": 1.52901023890785e-05, "loss": 0.0421, "num_tokens": 105721555.0, "reward": 0.0810546875, "reward_std": 0.13679033517837524, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0654296875, "rewards/tag_count_reward/std": 0.11659470945596695, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.07681147051292993, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7605372559329074, "kl": 1.83203125, "learning_rate": 1.5324232081911263e-05, "loss": 0.0733, "num_tokens": 106286275.0, "reward": 0.072265625, "reward_std": 0.12561938166618347, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.009765625, "rewards/tag_count_reward/std": 0.05334262177348137, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 1994.15234375, "completions/mean_terminated_length": 515.5, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.07698216266962533, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3448981397919408, "kl": 2.41796875, "learning_rate": 1.5358361774744027e-05, "loss": 0.0986, "num_tokens": 106839946.0, "reward": 0.22265625, "reward_std": 0.1901227980852127, "rewards/accuracy_reward/mean": 0.22265625, "rewards/accuracy_reward/std": 0.41684433817863464, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 2030.1015625, "completions/mean_terminated_length": 520.6666870117188, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.07715285482632073, "frac_reward_zero_std": 0.75, "grad_norm": 0.40741419864542855, "kl": 2.57421875, "learning_rate": 1.5392491467576794e-05, "loss": 0.103, "num_tokens": 107405604.0, "reward": 0.0703125, "reward_std": 0.09579971432685852, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.26394182443618774, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 1900.1484375, "completions/mean_terminated_length": 489.9130554199219, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.07732354698301613, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3070023762228077, "kl": 2.375, "learning_rate": 1.5426621160409558e-05, "loss": 0.0904, "num_tokens": 107937450.0, "reward": 0.0908203125, "reward_std": 0.1506931483745575, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1924.0, "completions/mean_length": 1265.03125, "completions/mean_terminated_length": 758.4832153320312, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.07749423913971153, "frac_reward_zero_std": 0.625, "grad_norm": 0.21840724109674958, "kl": 1.54296875, "learning_rate": 1.546075085324232e-05, "loss": 0.0582, "num_tokens": 108300690.0, "reward": 0.099609375, "reward_std": 0.10398616641759872, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29743078351020813, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.001953125, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 750.23046875, "completions/mean_terminated_length": 715.7701416015625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.07766493129640693, "frac_reward_zero_std": 0.625, "grad_norm": 0.30132928201138715, "kl": 0.32861328125, "learning_rate": 1.5494880546075085e-05, "loss": 0.0174, "num_tokens": 108529581.0, "reward": 0.09375, "reward_std": 0.12470625340938568, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00390625, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 653.4140625, "completions/mean_terminated_length": 630.0597534179688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.07783562345310233, "frac_reward_zero_std": 0.5625, "grad_norm": 0.9927451629150513, "kl": 0.29638671875, "learning_rate": 1.5529010238907852e-05, "loss": 0.0046, "num_tokens": 108736855.0, "reward": 0.05859375, "reward_std": 0.10341504216194153, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22781464457511902, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00390625, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 1259.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 507.08984375, "completions/mean_terminated_length": 506.4510192871094, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.07800631560979773, "frac_reward_zero_std": 0.625, "grad_norm": 0.5162231558817547, "kl": 0.264404296875, "learning_rate": 1.5563139931740616e-05, "loss": 0.0146, "num_tokens": 108915310.0, "reward": 0.044921875, "reward_std": 0.06730774790048599, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20024390518665314, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.005859375, "rewards/tag_count_reward/std": 0.03789619356393814, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 435.2734375, "completions/mean_terminated_length": 433.556884765625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.07817700776649313, "frac_reward_zero_std": 0.1875, "grad_norm": 0.5702448530686923, "kl": 0.3525390625, "learning_rate": 1.559726962457338e-05, "loss": 0.0028, "num_tokens": 109068356.0, "reward": 0.0458984375, "reward_std": 0.10921958833932877, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0302734375, "rewards/tag_count_reward/std": 0.0817188173532486, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -6.96875, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 229.3125, "completions/mean_terminated_length": 228.2470703125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.07834769992318853, "frac_reward_zero_std": 0.0, "grad_norm": 3.950839917658121, "kl": 0.9482421875, "learning_rate": 1.5631399317406144e-05, "loss": -0.062, "num_tokens": 109172964.0, "reward": 0.09375, "reward_std": 0.1357371211051941, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.08984375, "rewards/tag_count_reward/std": 0.12420088052749634, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -5.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1382.0, "completions/mean_length": 524.71875, "completions/mean_terminated_length": 271.6875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.07851839207988393, "frac_reward_zero_std": 0.0, "grad_norm": 2.3211848629670824, "kl": 1.662109375, "learning_rate": 1.5665529010238908e-05, "loss": 0.1581, "num_tokens": 109350924.0, "reward": 0.103515625, "reward_std": 0.1427963376045227, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.103515625, "rewards/tag_count_reward/std": 0.1435794085264206, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1379.1484375, "completions/mean_terminated_length": 644.5081787109375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.07868908423657933, "frac_reward_zero_std": 0.0, "grad_norm": 0.5594655656391367, "kl": 0.8681640625, "learning_rate": 1.5699658703071675e-05, "loss": 0.069, "num_tokens": 109743970.0, "reward": 0.205078125, "reward_std": 0.1919827163219452, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.205078125, "rewards/tag_count_reward/std": 0.1978650689125061, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 2040.58203125, "completions/mean_terminated_length": 149.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.07885977639327472, "frac_reward_zero_std": 0.0, "grad_norm": 4.840114963305155, "kl": 0.9267578125, "learning_rate": 1.573378839590444e-05, "loss": 0.0389, "num_tokens": 110308359.0, "reward": 0.2705078125, "reward_std": 0.20974251627922058, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2666015625, "rewards/tag_count_reward/std": 0.21228858828544617, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 2040.66015625, "completions/mean_terminated_length": 169.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.07903046854997013, "frac_reward_zero_std": 0.0, "grad_norm": 0.5817347839103797, "kl": 0.5263671875, "learning_rate": 1.5767918088737202e-05, "loss": 0.0235, "num_tokens": 110875936.0, "reward": 0.259765625, "reward_std": 0.18860194087028503, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.259765625, "rewards/tag_count_reward/std": 0.19654452800750732, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.07920116070666552, "frac_reward_zero_std": 0.0, "grad_norm": 0.31650948356282016, "kl": 0.1895751953125, "learning_rate": 1.5802047781569966e-05, "loss": 0.0076, "num_tokens": 111440512.0, "reward": 0.33203125, "reward_std": 0.17744939029216766, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.33203125, "rewards/tag_count_reward/std": 0.1801670640707016, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.07937185286336093, "frac_reward_zero_std": 0.0, "grad_norm": 0.16969795085281061, "kl": 0.130859375, "learning_rate": 1.5836177474402733e-05, "loss": 0.0052, "num_tokens": 112004016.0, "reward": 0.365234375, "reward_std": 0.17698949575424194, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.365234375, "rewards/tag_count_reward/std": 0.1823118031024933, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.07954254502005632, "frac_reward_zero_std": 0.0, "grad_norm": 0.2266186312947074, "kl": 0.131103515625, "learning_rate": 1.5870307167235497e-05, "loss": 0.0052, "num_tokens": 112570544.0, "reward": 0.369140625, "reward_std": 0.1827990710735321, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.18646569550037384, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.07971323717675173, "frac_reward_zero_std": 0.0, "grad_norm": 0.23058261645125014, "kl": 0.22216796875, "learning_rate": 1.590443686006826e-05, "loss": 0.0089, "num_tokens": 113135200.0, "reward": 0.3974609375, "reward_std": 0.18557801842689514, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3974609375, "rewards/tag_count_reward/std": 0.18912440538406372, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.07988392933344712, "frac_reward_zero_std": 0.0, "grad_norm": 0.3135469617402567, "kl": 0.237548828125, "learning_rate": 1.5938566552901024e-05, "loss": 0.0095, "num_tokens": 113705632.0, "reward": 0.4677734375, "reward_std": 0.19181233644485474, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4677734375, "rewards/tag_count_reward/std": 0.19348841905593872, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08005462149014253, "frac_reward_zero_std": 0.0, "grad_norm": 0.3929395643839833, "kl": 0.4208984375, "learning_rate": 1.5972696245733788e-05, "loss": 0.0168, "num_tokens": 114269328.0, "reward": 0.4443359375, "reward_std": 0.19484853744506836, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4404296875, "rewards/tag_count_reward/std": 0.18686319887638092, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08022531364683792, "frac_reward_zero_std": 0.0, "grad_norm": 0.4425717804435337, "kl": 0.44287109375, "learning_rate": 1.6006825938566555e-05, "loss": 0.0177, "num_tokens": 114832512.0, "reward": 0.4814453125, "reward_std": 0.18614359200000763, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4775390625, "rewards/tag_count_reward/std": 0.17776775360107422, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08039600580353333, "frac_reward_zero_std": 0.0, "grad_norm": 0.42161493390286686, "kl": 0.57275390625, "learning_rate": 1.604095563139932e-05, "loss": 0.0229, "num_tokens": 115395504.0, "reward": 0.580078125, "reward_std": 0.16349345445632935, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.580078125, "rewards/tag_count_reward/std": 0.1684228926897049, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08056669796022872, "frac_reward_zero_std": 0.0, "grad_norm": 0.26836303185840765, "kl": 0.7373046875, "learning_rate": 1.6075085324232083e-05, "loss": 0.0296, "num_tokens": 115961104.0, "reward": 0.6103515625, "reward_std": 0.17779892683029175, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6103515625, "rewards/tag_count_reward/std": 0.17926941812038422, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08073739011692413, "frac_reward_zero_std": 0.0, "grad_norm": 0.2382419714852144, "kl": 0.9033203125, "learning_rate": 1.6109215017064847e-05, "loss": 0.0361, "num_tokens": 116536432.0, "reward": 0.6591796875, "reward_std": 0.2025543451309204, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.6513671875, "rewards/tag_count_reward/std": 0.18600043654441833, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08090808227361952, "frac_reward_zero_std": 0.0, "grad_norm": 0.2547678864032452, "kl": 1.5859375, "learning_rate": 1.6143344709897614e-05, "loss": 0.0634, "num_tokens": 117098400.0, "reward": 0.654296875, "reward_std": 0.24383780360221863, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.01171875, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.642578125, "rewards/tag_count_reward/std": 0.22400891780853271, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08107877443031493, "frac_reward_zero_std": 0.0, "grad_norm": 0.2708560810692859, "kl": 1.365234375, "learning_rate": 1.6177474402730378e-05, "loss": 0.0546, "num_tokens": 117666048.0, "reward": 0.6923828125, "reward_std": 0.32399290800094604, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0546875, "rewards/format_reward/std": 0.22781464457511902, "rewards/tag_count_reward/mean": 0.6376953125, "rewards/tag_count_reward/std": 0.22103802859783173, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08124946658701032, "frac_reward_zero_std": 0.0, "grad_norm": 0.2904710566824543, "kl": 0.896484375, "learning_rate": 1.621160409556314e-05, "loss": 0.0359, "num_tokens": 118226672.0, "reward": 0.7099609375, "reward_std": 0.3175894618034363, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.0390625, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.6591796875, "rewards/tag_count_reward/std": 0.22752659022808075, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08142015874370573, "frac_reward_zero_std": 0.0, "grad_norm": 0.29514957895140975, "kl": 1.16796875, "learning_rate": 1.6245733788395905e-05, "loss": 0.0467, "num_tokens": 118791872.0, "reward": 0.8154296875, "reward_std": 0.3986031413078308, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.08203125, "rewards/format_reward/std": 0.2749498784542084, "rewards/tag_count_reward/mean": 0.7021484375, "rewards/tag_count_reward/std": 0.23095130920410156, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08159085090040112, "frac_reward_zero_std": 0.0, "grad_norm": 0.4118951310944225, "kl": 1.4296875, "learning_rate": 1.627986348122867e-05, "loss": 0.0572, "num_tokens": 119361552.0, "reward": 0.9599609375, "reward_std": 0.4736751914024353, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.2421875, "rewards/format_reward/std": 0.4292463958263397, "rewards/tag_count_reward/mean": 0.7138671875, "rewards/tag_count_reward/std": 0.20027698576450348, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08176154305709653, "frac_reward_zero_std": 0.0, "grad_norm": 0.21062260187459247, "kl": 2.4609375, "learning_rate": 1.6313993174061436e-05, "loss": 0.0984, "num_tokens": 119926176.0, "reward": 1.099609375, "reward_std": 0.5565367341041565, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.49482619762420654, "rewards/tag_count_reward/mean": 0.673828125, "rewards/tag_count_reward/std": 0.1969338357448578, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08193223521379192, "frac_reward_zero_std": 0.0, "grad_norm": 0.1535994126633624, "kl": 2.66796875, "learning_rate": 1.63481228668942e-05, "loss": 0.1067, "num_tokens": 120489232.0, "reward": 1.37890625, "reward_std": 0.5197365283966064, "rewards/accuracy_reward/mean": 0.02916666679084301, "rewards/accuracy_reward/std": 0.16862517595291138, "rewards/format_reward/mean": 0.69921875, "rewards/format_reward/std": 0.45949608087539673, "rewards/tag_count_reward/mean": 0.65234375, "rewards/tag_count_reward/std": 0.19360961019992828, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08210292737048733, "frac_reward_zero_std": 0.0, "grad_norm": 0.17997894887344754, "kl": 2.56640625, "learning_rate": 1.6382252559726964e-05, "loss": 0.1027, "num_tokens": 121048416.0, "reward": 1.3974609375, "reward_std": 0.4115820527076721, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.82421875, "rewards/format_reward/std": 0.3813795745372772, "rewards/tag_count_reward/mean": 0.5576171875, "rewards/tag_count_reward/std": 0.17107099294662476, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08227361952718272, "frac_reward_zero_std": 0.0, "grad_norm": 0.17424232171098633, "kl": 2.69921875, "learning_rate": 1.6416382252559727e-05, "loss": 0.1081, "num_tokens": 121616800.0, "reward": 1.484375, "reward_std": 0.2318345308303833, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.22781464457511902, "rewards/tag_count_reward/mean": 0.53515625, "rewards/tag_count_reward/std": 0.1390950232744217, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1633.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 958.0, "completions/min_terminated_length": 0.0, "epoch": 0.08244431168387813, "frac_reward_zero_std": 0.0, "grad_norm": 0.1971188422421911, "kl": 3.125, "learning_rate": 1.6450511945392495e-05, "loss": 0.1251, "num_tokens": 122076336.0, "reward": 1.5595703125, "reward_std": 0.1695912480354309, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.5830078125, "rewards/tag_count_reward/std": 0.12205084413290024, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1326.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1052.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 694.0, "completions/min_terminated_length": 0.0, "epoch": 0.08261500384057352, "frac_reward_zero_std": 0.0, "grad_norm": 0.24635029940918923, "kl": 3.3828125, "learning_rate": 1.648464163822526e-05, "loss": 0.1354, "num_tokens": 122388544.0, "reward": 1.705078125, "reward_std": 0.21946144104003906, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.728515625, "rewards/tag_count_reward/std": 0.1486123651266098, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1121.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 803.0, "completions/min_terminated_length": 0.0, "epoch": 0.08278569599726893, "frac_reward_zero_std": 0.0, "grad_norm": 0.2686181442554733, "kl": 3.34765625, "learning_rate": 1.6518771331058022e-05, "loss": 0.1341, "num_tokens": 122714480.0, "reward": 1.7255859375, "reward_std": 0.37860578298568726, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33136674761772156, "rewards/tag_count_reward/mean": 0.8193359375, "rewards/tag_count_reward/std": 0.1322028636932373, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1206.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1006.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 831.0, "completions/min_terminated_length": 0.0, "epoch": 0.08295638815396432, "frac_reward_zero_std": 0.0, "grad_norm": 0.27150779884743553, "kl": 3.1015625, "learning_rate": 1.6552901023890786e-05, "loss": 0.1241, "num_tokens": 123018288.0, "reward": 1.703125, "reward_std": 0.5054073333740234, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 0.7890625, "rewards/format_reward/std": 0.4087733030319214, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.1579200029373169, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1420.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1086.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 724.0, "completions/min_terminated_length": 0.0, "epoch": 0.08312708031065973, "frac_reward_zero_std": 0.0, "grad_norm": 0.2396606588665472, "kl": 3.21484375, "learning_rate": 1.658703071672355e-05, "loss": 0.1284, "num_tokens": 123340432.0, "reward": 1.7646484375, "reward_std": 0.3446662127971649, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.85546875, "rewards/format_reward/std": 0.35231640934944153, "rewards/tag_count_reward/mean": 0.8193359375, "rewards/tag_count_reward/std": 0.21879158914089203, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1192.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1034.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 890.0, "completions/min_terminated_length": 0.0, "epoch": 0.08329777246735512, "frac_reward_zero_std": 0.0, "grad_norm": 0.2854273869529426, "kl": 3.0078125, "learning_rate": 1.6621160409556317e-05, "loss": 0.1202, "num_tokens": 123645472.0, "reward": 1.83984375, "reward_std": 0.3400724530220032, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24253563582897186, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.1398090422153473, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1342.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1000.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 806.0, "completions/min_terminated_length": 0.0, "epoch": 0.08346846462405053, "frac_reward_zero_std": 0.1875, "grad_norm": 0.21836064657414453, "kl": 3.22265625, "learning_rate": 1.6655290102389077e-05, "loss": 0.1288, "num_tokens": 123942128.0, "reward": 1.9599609375, "reward_std": 0.19821134209632874, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.9677734375, "rewards/tag_count_reward/std": 0.09228325635194778, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1194.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 986.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 810.0, "completions/min_terminated_length": 0.0, "epoch": 0.08363915678074592, "frac_reward_zero_std": 0.375, "grad_norm": 0.2051695255251561, "kl": 3.109375, "learning_rate": 1.6689419795221844e-05, "loss": 0.1243, "num_tokens": 124234000.0, "reward": 1.9326171875, "reward_std": 0.2511597275733948, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24253563582897186, "rewards/tag_count_reward/mean": 0.9833984375, "rewards/tag_count_reward/std": 0.06978800892829895, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1165.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1006.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 824.0, "completions/min_terminated_length": 0.0, "epoch": 0.08380984893744133, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7780742063953088, "kl": 3.26953125, "learning_rate": 1.6723549488054608e-05, "loss": 0.131, "num_tokens": 124533760.0, "reward": 1.966796875, "reward_std": 0.1892889142036438, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.048530805855989456, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1034.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 742.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 602.0, "completions/min_terminated_length": 0.0, "epoch": 0.08398054109413672, "frac_reward_zero_std": 0.0, "grad_norm": 0.3956546494103436, "kl": 3.7109375, "learning_rate": 1.6757679180887375e-05, "loss": 0.1484, "num_tokens": 124766512.0, "reward": 0.9501953125, "reward_std": 0.33055561780929565, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.05859375, "rewards/format_reward/std": 0.23532284796237946, "rewards/tag_count_reward/mean": 0.8564453125, "rewards/tag_count_reward/std": 0.1843043714761734, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 870.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 694.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 544.0, "completions/min_terminated_length": 0.0, "epoch": 0.08415123325083212, "frac_reward_zero_std": 0.0, "grad_norm": 0.3817339426613729, "kl": 3.73046875, "learning_rate": 1.6791808873720136e-05, "loss": 0.149, "num_tokens": 124988112.0, "reward": 1.2490234375, "reward_std": 0.6133817434310913, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.41015625, "rewards/format_reward/std": 0.49282538890838623, "rewards/tag_count_reward/mean": 0.8193359375, "rewards/tag_count_reward/std": 0.1963568925857544, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 775.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 647.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 578.0, "completions/min_terminated_length": 0.0, "epoch": 0.08432192540752752, "frac_reward_zero_std": 0.0, "grad_norm": 0.41021026868276883, "kl": 3.9140625, "learning_rate": 1.6825938566552903e-05, "loss": 0.1563, "num_tokens": 125197424.0, "reward": 1.48828125, "reward_std": 0.5218134522438049, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.80078125, "rewards/format_reward/std": 0.40019527077674866, "rewards/tag_count_reward/mean": 0.6484375, "rewards/tag_count_reward/std": 0.24240928888320923, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1137.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 793.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 541.0, "completions/min_terminated_length": 0.0, "epoch": 0.08449261756422292, "frac_reward_zero_std": 0.0, "grad_norm": 0.43177537404782096, "kl": 3.44921875, "learning_rate": 1.6860068259385667e-05, "loss": 0.138, "num_tokens": 125446080.0, "reward": 1.619140625, "reward_std": 0.5029963254928589, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.8203125, "rewards/format_reward/std": 0.38467901945114136, "rewards/tag_count_reward/mean": 0.791015625, "rewards/tag_count_reward/std": 0.17926140129566193, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1092.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 611.0, "completions/min_terminated_length": 0.0, "epoch": 0.08466330972091832, "frac_reward_zero_std": 0.0, "grad_norm": 0.40512058260888023, "kl": 3.62890625, "learning_rate": 1.689419795221843e-05, "loss": 0.1452, "num_tokens": 125767424.0, "reward": 0.3779296875, "reward_std": 0.18297015130519867, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3505859375, "rewards/tag_count_reward/std": 0.15303994715213776, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1050.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 739.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 567.0, "completions/min_terminated_length": 0.0, "epoch": 0.08483400187761372, "frac_reward_zero_std": 0.0, "grad_norm": 0.4653260433308534, "kl": 3.703125, "learning_rate": 1.6928327645051198e-05, "loss": 0.1479, "num_tokens": 125999600.0, "reward": 0.5205078125, "reward_std": 0.1970268040895462, "rewards/accuracy_reward/mean": 0.02083333395421505, "rewards/accuracy_reward/std": 0.14312462508678436, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5009765625, "rewards/tag_count_reward/std": 0.16641852259635925, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1025.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 491.0, "completions/min_terminated_length": 0.0, "epoch": 0.08500469403430912, "frac_reward_zero_std": 0.0, "grad_norm": 0.5430003787219974, "kl": 3.7421875, "learning_rate": 1.6962457337883958e-05, "loss": 0.1498, "num_tokens": 126305680.0, "reward": 0.544921875, "reward_std": 0.16385304927825928, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.541015625, "rewards/tag_count_reward/std": 0.1589713990688324, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 797.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 612.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 498.0, "completions/min_terminated_length": 0.0, "epoch": 0.08517538619100452, "frac_reward_zero_std": 0.0, "grad_norm": 0.6617456693296199, "kl": 3.71875, "learning_rate": 1.6996587030716725e-05, "loss": 0.1487, "num_tokens": 126501952.0, "reward": 0.431640625, "reward_std": 0.16118672490119934, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.431640625, "rewards/tag_count_reward/std": 0.16778501868247986, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 751.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 582.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 422.0, "completions/min_terminated_length": 0.0, "epoch": 0.08534607834769992, "frac_reward_zero_std": 0.0, "grad_norm": 1.2983164452418954, "kl": 3.59765625, "learning_rate": 1.703071672354949e-05, "loss": 0.1437, "num_tokens": 126689296.0, "reward": 0.640625, "reward_std": 0.14570948481559753, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.640625, "rewards/tag_count_reward/std": 0.14934498071670532, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 892.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 604.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 410.0, "completions/min_terminated_length": 0.0, "epoch": 0.08551677050439532, "frac_reward_zero_std": 0.0, "grad_norm": 1.5281784809268544, "kl": 3.5078125, "learning_rate": 1.7064846416382256e-05, "loss": 0.1405, "num_tokens": 126885600.0, "reward": 0.6259765625, "reward_std": 0.18071767687797546, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6259765625, "rewards/tag_count_reward/std": 0.19926095008850098, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 748.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 620.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 490.0, "completions/min_terminated_length": 0.0, "epoch": 0.08568746266109073, "frac_reward_zero_std": 0.0, "grad_norm": 1.5695844002732051, "kl": 2.78515625, "learning_rate": 1.7098976109215017e-05, "loss": 0.1115, "num_tokens": 127084016.0, "reward": 0.544921875, "reward_std": 0.16602297127246857, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.541015625, "rewards/tag_count_reward/std": 0.1605057716369629, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 913.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 715.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 642.0, "completions/min_terminated_length": 0.0, "epoch": 0.08585815481778612, "frac_reward_zero_std": 0.0, "grad_norm": 8.388570588812778, "kl": 2.62109375, "learning_rate": 1.7133105802047784e-05, "loss": 0.1048, "num_tokens": 127318912.0, "reward": 0.4970703125, "reward_std": 0.1838625967502594, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4931640625, "rewards/tag_count_reward/std": 0.177681565284729, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 927.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 712.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 585.0, "completions/min_terminated_length": 0.0, "epoch": 0.08602884697448153, "frac_reward_zero_std": 0.0, "grad_norm": 384.3230177363959, "kl": 35.875, "learning_rate": 1.7167235494880547e-05, "loss": 1.435, "num_tokens": 127542352.0, "reward": 0.4267578125, "reward_std": 0.2116931676864624, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4033203125, "rewards/tag_count_reward/std": 0.1705777794122696, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1269.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 853.0, "completions/min_terminated_length": 0.0, "epoch": 0.08619953913117692, "frac_reward_zero_std": 0.0, "grad_norm": 21177.548443678017, "kl": 1912.4375, "learning_rate": 1.720136518771331e-05, "loss": 76.4223, "num_tokens": 127915168.0, "reward": 0.3701171875, "reward_std": 0.1762849986553192, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3544921875, "rewards/tag_count_reward/std": 0.1552017629146576, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1575.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 831.0, "completions/min_terminated_length": 0.0, "epoch": 0.08637023128787233, "frac_reward_zero_std": 0.0, "grad_norm": 641.4995082171478, "kl": 59.40625, "learning_rate": 1.723549488054608e-05, "loss": 2.3807, "num_tokens": 128365136.0, "reward": 0.33203125, "reward_std": 0.1878146529197693, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3203125, "rewards/tag_count_reward/std": 0.16099415719509125, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1890.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1484.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1023.0, "completions/min_terminated_length": 0.0, "epoch": 0.08654092344456772, "frac_reward_zero_std": 0.0, "grad_norm": 5.66254116210531, "kl": 2.5625, "learning_rate": 1.726962457337884e-05, "loss": 0.1026, "num_tokens": 128782816.0, "reward": 0.244140625, "reward_std": 0.1983015239238739, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.228515625, "rewards/tag_count_reward/std": 0.16869552433490753, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1976.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1473.0, "completions/min_terminated_length": 0.0, "epoch": 0.08671161560126313, "frac_reward_zero_std": 0.0, "grad_norm": 2.6794026530203356, "kl": 2.515625, "learning_rate": 1.7303754266211606e-05, "loss": 0.1006, "num_tokens": 129325728.0, "reward": 0.2998046875, "reward_std": 0.17504863440990448, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2724609375, "rewards/tag_count_reward/std": 0.14084270596504211, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08688230775795852, "frac_reward_zero_std": 0.0, "grad_norm": 1.4407898629936844, "kl": 1.57421875, "learning_rate": 1.733788395904437e-05, "loss": 0.0629, "num_tokens": 129897040.0, "reward": 0.302734375, "reward_std": 0.1920856237411499, "rewards/accuracy_reward/mean": 0.05833333358168602, "rewards/accuracy_reward/std": 0.23486249148845673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.248046875, "rewards/tag_count_reward/std": 0.17849066853523254, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.08705299991465393, "frac_reward_zero_std": 0.0, "grad_norm": 5.173843939073927, "kl": 1.611328125, "learning_rate": 1.7372013651877137e-05, "loss": 0.0645, "num_tokens": 130463664.0, "reward": 0.3125, "reward_std": 0.1515227109193802, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.30078125, "rewards/tag_count_reward/std": 0.1359764039516449, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1694.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1321.0, "completions/min_terminated_length": 0.0, "epoch": 0.08722369207134932, "frac_reward_zero_std": 0.0, "grad_norm": 4.598774201721938, "kl": 1.654296875, "learning_rate": 1.7406143344709897e-05, "loss": 0.0662, "num_tokens": 130939888.0, "reward": 0.3818359375, "reward_std": 0.23925787210464478, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2919921875, "rewards/tag_count_reward/std": 0.17276397347450256, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1392.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 1132.0, "completions/min_terminated_length": 0.0, "epoch": 0.08739438422804473, "frac_reward_zero_std": 0.0, "grad_norm": 11.506052748088017, "kl": 2.048828125, "learning_rate": 1.7440273037542664e-05, "loss": 0.082, "num_tokens": 131338976.0, "reward": 0.34375, "reward_std": 0.2235952615737915, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3125, "rewards/tag_count_reward/std": 0.19678793847560883, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1640.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1183.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 812.0, "completions/min_terminated_length": 0.0, "epoch": 0.08756507638474012, "frac_reward_zero_std": 0.0, "grad_norm": 5.7802682720567296, "kl": 2.14453125, "learning_rate": 1.7474402730375428e-05, "loss": 0.0859, "num_tokens": 131682432.0, "reward": 0.3740234375, "reward_std": 0.20210641622543335, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3662109375, "rewards/tag_count_reward/std": 0.18896234035491943, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1212.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 864.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 696.0, "completions/min_terminated_length": 0.0, "epoch": 0.08773576854143553, "frac_reward_zero_std": 0.0, "grad_norm": 9.228111612799676, "kl": 2.3203125, "learning_rate": 1.7508532423208192e-05, "loss": 0.0927, "num_tokens": 131944048.0, "reward": 0.388671875, "reward_std": 0.1945350170135498, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.380859375, "rewards/tag_count_reward/std": 0.18247976899147034, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 828.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 690.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 564.0, "completions/min_terminated_length": 0.0, "epoch": 0.08790646069813092, "frac_reward_zero_std": 0.0, "grad_norm": 6.138836320893438, "kl": 1.966796875, "learning_rate": 1.7542662116040956e-05, "loss": 0.0787, "num_tokens": 132160240.0, "reward": 0.4384765625, "reward_std": 0.17020264267921448, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4384765625, "rewards/tag_count_reward/std": 0.17538200318813324, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 669.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 538.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 451.0, "completions/min_terminated_length": 0.0, "epoch": 0.08807715285482633, "frac_reward_zero_std": 0.0, "grad_norm": 2.479398414484662, "kl": 2.94921875, "learning_rate": 1.757679180887372e-05, "loss": 0.1181, "num_tokens": 132333424.0, "reward": 0.5234375, "reward_std": 0.1744753122329712, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.51953125, "rewards/tag_count_reward/std": 0.1746407449245453, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 492.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 417.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 296.0, "completions/min_terminated_length": 0.0, "epoch": 0.08824784501152172, "frac_reward_zero_std": 0.0, "grad_norm": 284536.2980711096, "kl": 24048.0, "learning_rate": 1.7610921501706487e-05, "loss": 962.3273, "num_tokens": 132476848.0, "reward": 0.61328125, "reward_std": 0.1710839867591858, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.60546875, "rewards/tag_count_reward/std": 0.16452360153198242, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 665.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 480.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 408.0, "completions/min_terminated_length": 0.0, "epoch": 0.08841853716821713, "frac_reward_zero_std": 0.0, "grad_norm": 64.80917112170222, "kl": 9.484375, "learning_rate": 1.764505119453925e-05, "loss": 0.3801, "num_tokens": 132643344.0, "reward": 0.533203125, "reward_std": 0.18432581424713135, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.529296875, "rewards/tag_count_reward/std": 0.177457794547081, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 658.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 486.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 342.0, "completions/min_terminated_length": 0.0, "epoch": 0.08858922932491252, "frac_reward_zero_std": 0.0, "grad_norm": 3.416540331060326, "kl": 4.59375, "learning_rate": 1.7679180887372018e-05, "loss": 0.1838, "num_tokens": 132815104.0, "reward": 0.3544921875, "reward_std": 0.18568843603134155, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3427734375, "rewards/tag_count_reward/std": 0.16843141615390778, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 818.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 607.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 499.0, "completions/min_terminated_length": 0.0, "epoch": 0.08875992148160793, "frac_reward_zero_std": 0.0, "grad_norm": 48.81165937014762, "kl": 6.3671875, "learning_rate": 1.7713310580204778e-05, "loss": 0.2544, "num_tokens": 133008240.0, "reward": 0.5986328125, "reward_std": 0.17814429104328156, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5986328125, "rewards/tag_count_reward/std": 0.1806526482105255, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 938.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 618.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 476.0, "completions/min_terminated_length": 0.0, "epoch": 0.08893061363830332, "frac_reward_zero_std": 0.0, "grad_norm": 2.98777536265034, "kl": 4.484375, "learning_rate": 1.7747440273037545e-05, "loss": 0.1793, "num_tokens": 133206480.0, "reward": 0.6201171875, "reward_std": 0.16828574240207672, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6201171875, "rewards/tag_count_reward/std": 0.16854506731033325, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 820.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 697.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 575.0, "completions/min_terminated_length": 0.0, "epoch": 0.08910130579499873, "frac_reward_zero_std": 0.0, "grad_norm": 1.1437346049094028, "kl": 3.9765625, "learning_rate": 1.778156996587031e-05, "loss": 0.1589, "num_tokens": 133424976.0, "reward": 0.466796875, "reward_std": 0.21435365080833435, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.462890625, "rewards/tag_count_reward/std": 0.2079070508480072, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 846.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 541.0, "completions/min_terminated_length": 0.0, "epoch": 0.08927199795169412, "frac_reward_zero_std": 0.0, "grad_norm": 1.3519186576878224, "kl": 3.28515625, "learning_rate": 1.7815699658703073e-05, "loss": 0.1315, "num_tokens": 133677824.0, "reward": 0.509765625, "reward_std": 0.20203769207000732, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.509765625, "rewards/tag_count_reward/std": 0.20025068521499634, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1118.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1029.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 912.0, "completions/min_terminated_length": 0.0, "epoch": 0.08944269010838952, "frac_reward_zero_std": 0.0, "grad_norm": 0.8530392449602675, "kl": 2.83984375, "learning_rate": 1.7849829351535836e-05, "loss": 0.1136, "num_tokens": 133985600.0, "reward": 0.5390625, "reward_std": 0.18879291415214539, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.53125, "rewards/tag_count_reward/std": 0.17712298035621643, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1308.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1150.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1004.0, "completions/min_terminated_length": 0.0, "epoch": 0.08961338226508492, "frac_reward_zero_std": 0.0, "grad_norm": 9.773744288432944, "kl": 3.8125, "learning_rate": 1.78839590443686e-05, "loss": 0.1529, "num_tokens": 134316736.0, "reward": 0.568359375, "reward_std": 0.20648206770420074, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.548828125, "rewards/tag_count_reward/std": 0.1772850602865219, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1514.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1201.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 1069.0, "completions/min_terminated_length": 0.0, "epoch": 0.08978407442178032, "frac_reward_zero_std": 0.0, "grad_norm": 18.5608509802387, "kl": 4.82421875, "learning_rate": 1.7918088737201367e-05, "loss": 0.1926, "num_tokens": 134667056.0, "reward": 0.6611328125, "reward_std": 0.20240817964076996, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6259765625, "rewards/tag_count_reward/std": 0.17006061971187592, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1869.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1502.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1287.0, "completions/min_terminated_length": 0.0, "epoch": 0.08995476657847572, "frac_reward_zero_std": 0.0, "grad_norm": 2.638640637020972, "kl": 2.42578125, "learning_rate": 1.795221843003413e-05, "loss": 0.0968, "num_tokens": 135097840.0, "reward": 0.619140625, "reward_std": 0.22051817178726196, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.587890625, "rewards/tag_count_reward/std": 0.18810157477855682, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1518.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1287.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 1026.0, "completions/min_terminated_length": 0.0, "epoch": 0.09012545873517112, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4617797759222287, "kl": 2.375, "learning_rate": 1.7986348122866895e-05, "loss": 0.0948, "num_tokens": 135470464.0, "reward": 0.681640625, "reward_std": 0.13422900438308716, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.677734375, "rewards/tag_count_reward/std": 0.1334032118320465, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1433.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1128.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 818.0, "completions/min_terminated_length": 0.0, "epoch": 0.09029615089186652, "frac_reward_zero_std": 0.0625, "grad_norm": 1.8060634493501901, "kl": 2.71484375, "learning_rate": 1.802047781569966e-05, "loss": 0.1085, "num_tokens": 135803504.0, "reward": 0.7216796875, "reward_std": 0.12959425151348114, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7099609375, "rewards/tag_count_reward/std": 0.10668075084686279, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1309.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 979.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 746.0, "completions/min_terminated_length": 0.0, "epoch": 0.09046684304856192, "frac_reward_zero_std": 0.375, "grad_norm": 0.39528764956654566, "kl": 2.984375, "learning_rate": 1.8054607508532426e-05, "loss": 0.1195, "num_tokens": 136097408.0, "reward": 0.744140625, "reward_std": 0.08161579072475433, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.732421875, "rewards/tag_count_reward/std": 0.06776251643896103, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1297.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1045.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 836.0, "completions/min_terminated_length": 0.0, "epoch": 0.09063753520525732, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7516058195631491, "kl": 3.21484375, "learning_rate": 1.808873720136519e-05, "loss": 0.1286, "num_tokens": 136404720.0, "reward": 0.7314453125, "reward_std": 0.07482551783323288, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7275390625, "rewards/tag_count_reward/std": 0.08124882727861404, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1801.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1144.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 833.0, "completions/min_terminated_length": 0.0, "epoch": 0.09080822736195272, "frac_reward_zero_std": 0.3125, "grad_norm": 0.34605760864467733, "kl": 3.3359375, "learning_rate": 1.8122866894197953e-05, "loss": 0.1333, "num_tokens": 136739088.0, "reward": 0.75390625, "reward_std": 0.1351715326309204, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.11173487454652786, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1860.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1281.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 857.0, "completions/min_terminated_length": 0.0, "epoch": 0.09097891951864812, "frac_reward_zero_std": 0.25, "grad_norm": 0.33235267946382585, "kl": 3.45703125, "learning_rate": 1.8156996587030717e-05, "loss": 0.1383, "num_tokens": 137111360.0, "reward": 0.7783203125, "reward_std": 0.1484857201576233, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7314453125, "rewards/tag_count_reward/std": 0.08223269879817963, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 860.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 706.0, "completions/min_terminated_length": 0.0, "epoch": 0.09114961167534352, "frac_reward_zero_std": 0.5, "grad_norm": 0.6370461416212065, "kl": 3.5859375, "learning_rate": 1.819112627986348e-05, "loss": 0.1434, "num_tokens": 137371088.0, "reward": 0.7802734375, "reward_std": 0.1089666485786438, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.05928463488817215, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1231.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1019.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 684.0, "completions/min_terminated_length": 0.0, "epoch": 0.09132030383203892, "frac_reward_zero_std": 0.125, "grad_norm": 0.7315523078637951, "kl": 3.296875, "learning_rate": 1.8225255972696248e-05, "loss": 0.1319, "num_tokens": 137670528.0, "reward": 0.7919921875, "reward_std": 0.15981470048427582, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7138671875, "rewards/tag_count_reward/std": 0.11679161339998245, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1313.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1134.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 854.0, "completions/min_terminated_length": 0.0, "epoch": 0.09149099598873432, "frac_reward_zero_std": 0.0, "grad_norm": 1.0654640994001945, "kl": 3.03125, "learning_rate": 1.8259385665529012e-05, "loss": 0.1214, "num_tokens": 138004832.0, "reward": 0.8173828125, "reward_std": 0.23771905899047852, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8056640625, "rewards/tag_count_reward/std": 0.21022199094295502, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1877.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1456.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1011.0, "completions/min_terminated_length": 0.0, "epoch": 0.09166168814542971, "frac_reward_zero_std": 0.0, "grad_norm": 1.1341008649215516, "kl": 3.44140625, "learning_rate": 1.8293515358361776e-05, "loss": 0.1376, "num_tokens": 138413488.0, "reward": 0.859375, "reward_std": 0.19206112623214722, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.13625776767730713, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2014.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1631.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 1261.0, "completions/min_terminated_length": 0.0, "epoch": 0.09183238030212512, "frac_reward_zero_std": 0.0, "grad_norm": 19.907311910973284, "kl": 4.1015625, "learning_rate": 1.832764505119454e-05, "loss": 0.1641, "num_tokens": 138873664.0, "reward": 0.701171875, "reward_std": 0.24599626660346985, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.662109375, "rewards/tag_count_reward/std": 0.17874795198440552, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1595.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1097.0, "completions/min_terminated_length": 0.0, "epoch": 0.09200307245882051, "frac_reward_zero_std": 0.0, "grad_norm": 0.6523342711779226, "kl": 2.72265625, "learning_rate": 1.8361774744027307e-05, "loss": 0.1089, "num_tokens": 139320784.0, "reward": 0.583984375, "reward_std": 0.22247332334518433, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.525390625, "rewards/tag_count_reward/std": 0.17667919397354126, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1641.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 1116.0, "completions/min_terminated_length": 0.0, "epoch": 0.09217376461551592, "frac_reward_zero_std": 0.0, "grad_norm": 13.08675221005071, "kl": 2.6875, "learning_rate": 1.839590443686007e-05, "loss": 0.1073, "num_tokens": 139779232.0, "reward": 0.6845703125, "reward_std": 0.2463286817073822, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6494140625, "rewards/tag_count_reward/std": 0.21440672874450684, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1896.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1657.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1190.0, "completions/min_terminated_length": 0.0, "epoch": 0.09234445677221131, "frac_reward_zero_std": 0.0, "grad_norm": 1.170326771683335, "kl": 2.3046875, "learning_rate": 1.8430034129692834e-05, "loss": 0.092, "num_tokens": 140240192.0, "reward": 0.69140625, "reward_std": 0.209904283285141, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22781464457511902, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.63671875, "rewards/tag_count_reward/std": 0.2131909281015396, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1769.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1442.0, "completions/min_terminated_length": 0.0, "epoch": 0.09251514892890672, "frac_reward_zero_std": 0.0, "grad_norm": 4.127795023550303, "kl": 2.1640625, "learning_rate": 1.8464163822525598e-05, "loss": 0.0866, "num_tokens": 140735920.0, "reward": 0.7685546875, "reward_std": 0.2374771535396576, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7412109375, "rewards/tag_count_reward/std": 0.22067387402057648, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1886.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1511.0, "completions/min_terminated_length": 0.0, "epoch": 0.09268584108560211, "frac_reward_zero_std": 0.0, "grad_norm": 3.1011019585968915, "kl": 2.35546875, "learning_rate": 1.8498293515358362e-05, "loss": 0.0944, "num_tokens": 141257472.0, "reward": 0.7880859375, "reward_std": 0.2068866789340973, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7646484375, "rewards/tag_count_reward/std": 0.18794625997543335, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1828.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 1607.0, "completions/min_terminated_length": 0.0, "epoch": 0.09285653324229752, "frac_reward_zero_std": 0.0, "grad_norm": 1.3787990080353754, "kl": 1.654296875, "learning_rate": 1.853242320819113e-05, "loss": 0.0662, "num_tokens": 141766656.0, "reward": 0.705078125, "reward_std": 0.23582246899604797, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.693359375, "rewards/tag_count_reward/std": 0.22625432908535004, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1995.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 1731.0, "completions/min_terminated_length": 0.0, "epoch": 0.09302722539899291, "frac_reward_zero_std": 0.0, "grad_norm": 0.49793521543467506, "kl": 1.515625, "learning_rate": 1.8566552901023893e-05, "loss": 0.0606, "num_tokens": 142319680.0, "reward": 0.7275390625, "reward_std": 0.19052016735076904, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7275390625, "rewards/tag_count_reward/std": 0.1936071366071701, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1617.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1285.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1043.0, "completions/min_terminated_length": 0.0, "epoch": 0.09319791755568832, "frac_reward_zero_std": 0.0, "grad_norm": 2.6601180600900878, "kl": 1.859375, "learning_rate": 1.8600682593856656e-05, "loss": 0.0744, "num_tokens": 142690144.0, "reward": 0.771484375, "reward_std": 0.20999275147914886, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.748046875, "rewards/tag_count_reward/std": 0.19044862687587738, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1030.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 880.0, "completions/min_terminated_length": 0.0, "epoch": 0.09336860971238371, "frac_reward_zero_std": 0.0, "grad_norm": 2.578306862297225, "kl": 1.83984375, "learning_rate": 1.863481228668942e-05, "loss": 0.0734, "num_tokens": 142991920.0, "reward": 0.73046875, "reward_std": 0.19393321871757507, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.19158139824867249, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1134.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1005.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 795.0, "completions/min_terminated_length": 0.0, "epoch": 0.09353930186907912, "frac_reward_zero_std": 0.0, "grad_norm": 2.02166951744111, "kl": 1.751953125, "learning_rate": 1.8668941979522187e-05, "loss": 0.07, "num_tokens": 143288272.0, "reward": 0.7626953125, "reward_std": 0.22299589216709137, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.1564512401819229, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7392578125, "rewards/tag_count_reward/std": 0.20800602436065674, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1113.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 956.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 760.0, "completions/min_terminated_length": 0.0, "epoch": 0.09370999402577451, "frac_reward_zero_std": 0.0, "grad_norm": 3.2392962540181616, "kl": 1.65234375, "learning_rate": 1.870307167235495e-05, "loss": 0.0661, "num_tokens": 143577856.0, "reward": 0.8564453125, "reward_std": 0.20581382513046265, "rewards/accuracy_reward/mean": 0.02083333395421505, "rewards/accuracy_reward/std": 0.14312462508678436, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8369140625, "rewards/tag_count_reward/std": 0.18527840077877045, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1002.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 901.0, "completions/min_terminated_length": 0.0, "epoch": 0.09388068618246992, "frac_reward_zero_std": 0.0, "grad_norm": 1.5681082100496124, "kl": 1.685546875, "learning_rate": 1.8737201365187715e-05, "loss": 0.0674, "num_tokens": 143878368.0, "reward": 0.9189453125, "reward_std": 0.17884743213653564, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9033203125, "rewards/tag_count_reward/std": 0.157114639878273, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1274.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1087.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 968.0, "completions/min_terminated_length": 0.0, "epoch": 0.09405137833916531, "frac_reward_zero_std": 0.0, "grad_norm": 0.8835629549444775, "kl": 1.681640625, "learning_rate": 1.877133105802048e-05, "loss": 0.0672, "num_tokens": 144196384.0, "reward": 0.91015625, "reward_std": 0.17953447997570038, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.15611514449119568, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1464.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 1263.0, "completions/min_terminated_length": 0.0, "epoch": 0.09422207049586072, "frac_reward_zero_std": 0.0, "grad_norm": 10.804455915482382, "kl": 2.5078125, "learning_rate": 1.8805460750853242e-05, "loss": 0.1001, "num_tokens": 144609664.0, "reward": 0.818359375, "reward_std": 0.20990046858787537, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.775390625, "rewards/tag_count_reward/std": 0.1637185662984848, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1625.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1434.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1058.0, "completions/min_terminated_length": 0.0, "epoch": 0.09439276265255611, "frac_reward_zero_std": 0.0, "grad_norm": 6.527207618098263, "kl": 1.91015625, "learning_rate": 1.883959044368601e-05, "loss": 0.0764, "num_tokens": 145017648.0, "reward": 0.7412109375, "reward_std": 0.1689807027578354, "rewards/accuracy_reward/mean": 0.04583333432674408, "rewards/accuracy_reward/std": 0.2095605432987213, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6982421875, "rewards/tag_count_reward/std": 0.1062130555510521, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1970.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1645.0, "completions/min_terminated_length": 0.0, "epoch": 0.09456345480925152, "frac_reward_zero_std": 0.0, "grad_norm": 24.580025103184195, "kl": 3.4375, "learning_rate": 1.8873720136518773e-05, "loss": 0.1375, "num_tokens": 145561040.0, "reward": 0.67578125, "reward_std": 0.1605791300535202, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.63671875, "rewards/tag_count_reward/std": 0.12856438755989075, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.09473414696594691, "frac_reward_zero_std": 0.0, "grad_norm": 2.3518242185752927, "kl": 0.7958984375, "learning_rate": 1.8907849829351537e-05, "loss": 0.0318, "num_tokens": 146123232.0, "reward": 0.3056640625, "reward_std": 0.1546381711959839, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2666015625, "rewards/tag_count_reward/std": 0.12706249952316284, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.09490483912264232, "frac_reward_zero_std": 0.0, "grad_norm": 0.9434230776775482, "kl": 0.50927734375, "learning_rate": 1.89419795221843e-05, "loss": 0.0204, "num_tokens": 146691200.0, "reward": 0.3076171875, "reward_std": 0.14288224279880524, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2958984375, "rewards/tag_count_reward/std": 0.1256074458360672, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.09507553127933771, "frac_reward_zero_std": 0.0, "grad_norm": 7.090378184333875, "kl": 0.5078125, "learning_rate": 1.8976109215017068e-05, "loss": 0.0203, "num_tokens": 147255792.0, "reward": 0.3505859375, "reward_std": 0.16216862201690674, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3232421875, "rewards/tag_count_reward/std": 0.13917416334152222, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2010.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1910.0, "completions/min_terminated_length": 0.0, "epoch": 0.09524622343603312, "frac_reward_zero_std": 0.0, "grad_norm": 1.1173602984960715, "kl": 0.9794921875, "learning_rate": 1.9010238907849832e-05, "loss": 0.0392, "num_tokens": 147825472.0, "reward": 0.0810546875, "reward_std": 0.11195363104343414, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0810546875, "rewards/tag_count_reward/std": 0.11724977940320969, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.09541691559272851, "frac_reward_zero_std": 0.0, "grad_norm": 0.9757304597506731, "kl": 1.26171875, "learning_rate": 1.9044368600682596e-05, "loss": 0.0504, "num_tokens": 148385440.0, "reward": 0.12890625, "reward_std": 0.1566573530435562, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1171875, "rewards/tag_count_reward/std": 0.14665386080741882, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1724.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1347.0, "completions/min_terminated_length": 0.0, "epoch": 0.09558760774942392, "frac_reward_zero_std": 0.0, "grad_norm": 4.035146029586121, "kl": 1.97265625, "learning_rate": 1.907849829351536e-05, "loss": 0.0789, "num_tokens": 148865536.0, "reward": 0.11328125, "reward_std": 0.16106773912906647, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.08984375, "rewards/tag_count_reward/std": 0.12018929421901703, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1355.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1051.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 772.0, "completions/min_terminated_length": 0.0, "epoch": 0.09575829990611931, "frac_reward_zero_std": 0.0, "grad_norm": 2.042165456197897, "kl": 2.26953125, "learning_rate": 1.9112627986348123e-05, "loss": 0.0908, "num_tokens": 149174096.0, "reward": 0.224609375, "reward_std": 0.22304649651050568, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.177734375, "rewards/tag_count_reward/std": 0.1616469919681549, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1340.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1097.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 856.0, "completions/min_terminated_length": 0.0, "epoch": 0.09592899206281472, "frac_reward_zero_std": 0.0, "grad_norm": 3.024391797250089, "kl": 2.41796875, "learning_rate": 1.914675767918089e-05, "loss": 0.0969, "num_tokens": 149496032.0, "reward": 0.2265625, "reward_std": 0.20243996381759644, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.20703125, "rewards/tag_count_reward/std": 0.18945065140724182, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1206.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 1048.0, "completions/min_terminated_length": 0.0, "epoch": 0.09609968421951011, "frac_reward_zero_std": 0.0, "grad_norm": 4.2449270926658516, "kl": 2.40625, "learning_rate": 1.918088737201365e-05, "loss": 0.0962, "num_tokens": 149841408.0, "reward": 0.3740234375, "reward_std": 0.21948769688606262, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3701171875, "rewards/tag_count_reward/std": 0.22134965658187866, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1324.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 1054.0, "completions/min_terminated_length": 0.0, "epoch": 0.09627037637620552, "frac_reward_zero_std": 0.0, "grad_norm": 4.437622069651732, "kl": 2.296875, "learning_rate": 1.9215017064846418e-05, "loss": 0.0921, "num_tokens": 150220256.0, "reward": 0.4140625, "reward_std": 0.2264309972524643, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.20276883244514465, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1611.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1279.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 1111.0, "completions/min_terminated_length": 0.0, "epoch": 0.09644106853290091, "frac_reward_zero_std": 0.0, "grad_norm": 7.191638946916784, "kl": 2.037109375, "learning_rate": 1.924914675767918e-05, "loss": 0.0814, "num_tokens": 150589984.0, "reward": 0.494140625, "reward_std": 0.25959864258766174, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.462890625, "rewards/tag_count_reward/std": 0.2302808314561844, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1430.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 1107.0, "completions/min_terminated_length": 0.0, "epoch": 0.09661176068959632, "frac_reward_zero_std": 0.0, "grad_norm": 1.7179095675298361, "kl": 1.806640625, "learning_rate": 1.928327645051195e-05, "loss": 0.0723, "num_tokens": 151008000.0, "reward": 0.5, "reward_std": 0.24789677560329437, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46484375, "rewards/tag_count_reward/std": 0.23791244626045227, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1988.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1631.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 1394.0, "completions/min_terminated_length": 0.0, "epoch": 0.09678245284629171, "frac_reward_zero_std": 0.0, "grad_norm": 46.310231949535634, "kl": 3.951171875, "learning_rate": 1.9317406143344713e-05, "loss": 0.1584, "num_tokens": 151463520.0, "reward": 0.4296875, "reward_std": 0.3489266037940979, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.0703125, "rewards/format_reward/std": 0.2561737895011902, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.2348015010356903, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1984.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1831.0, "completions/min_terminated_length": 0.0, "epoch": 0.09695314500298712, "frac_reward_zero_std": 0.0, "grad_norm": 2.626016070079555, "kl": 1.474609375, "learning_rate": 1.9351535836177476e-05, "loss": 0.059, "num_tokens": 152007728.0, "reward": 0.3056640625, "reward_std": 0.2903060019016266, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.01953125, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.2431640625, "rewards/tag_count_reward/std": 0.2283833920955658, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.09712383715968251, "frac_reward_zero_std": 0.0, "grad_norm": 7.379403613269036, "kl": 1.25390625, "learning_rate": 1.938566552901024e-05, "loss": 0.0501, "num_tokens": 152571616.0, "reward": 0.248046875, "reward_std": 0.26249998807907104, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.208984375, "rewards/tag_count_reward/std": 0.20834864675998688, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.09729452931637791, "frac_reward_zero_std": 0.0, "grad_norm": 11.367834885078345, "kl": 1.15625, "learning_rate": 1.9419795221843004e-05, "loss": 0.0462, "num_tokens": 153145328.0, "reward": 0.2578125, "reward_std": 0.2568889856338501, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.0234375, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.2265625, "rewards/tag_count_reward/std": 0.22565264999866486, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.09746522147307331, "frac_reward_zero_std": 0.0, "grad_norm": 10.234690590947361, "kl": 1.099609375, "learning_rate": 1.945392491467577e-05, "loss": 0.044, "num_tokens": 153712960.0, "reward": 0.294921875, "reward_std": 0.2942594885826111, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.05078125, "rewards/format_reward/std": 0.21998079121112823, "rewards/tag_count_reward/mean": 0.244140625, "rewards/tag_count_reward/std": 0.23107771575450897, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.09763591362976871, "frac_reward_zero_std": 0.0, "grad_norm": 10.177349986701312, "kl": 1.13671875, "learning_rate": 1.948805460750853e-05, "loss": 0.0455, "num_tokens": 154278880.0, "reward": 0.3173828125, "reward_std": 0.3098580837249756, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.24253563582897186, "rewards/tag_count_reward/mean": 0.2392578125, "rewards/tag_count_reward/std": 0.21034947037696838, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.09780660578646411, "frac_reward_zero_std": 0.0, "grad_norm": 9.957843283602427, "kl": 1.33203125, "learning_rate": 1.95221843003413e-05, "loss": 0.0533, "num_tokens": 154851296.0, "reward": 0.3408203125, "reward_std": 0.3137795329093933, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.05859375, "rewards/format_reward/std": 0.23532284796237946, "rewards/tag_count_reward/mean": 0.2822265625, "rewards/tag_count_reward/std": 0.24790598452091217, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.09797729794315951, "frac_reward_zero_std": 0.0, "grad_norm": 0.6906062455847473, "kl": 1.5390625, "learning_rate": 1.9556313993174062e-05, "loss": 0.0616, "num_tokens": 155420000.0, "reward": 0.32421875, "reward_std": 0.3159841001033783, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.05859375, "rewards/format_reward/std": 0.23532284796237946, "rewards/tag_count_reward/mean": 0.25390625, "rewards/tag_count_reward/std": 0.21232692897319794, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.0981479900998549, "frac_reward_zero_std": 0.0, "grad_norm": 0.6803990149503995, "kl": 1.240234375, "learning_rate": 1.959044368600683e-05, "loss": 0.0496, "num_tokens": 155985568.0, "reward": 0.4169921875, "reward_std": 0.3571562170982361, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.11328125, "rewards/format_reward/std": 0.31755712628364563, "rewards/tag_count_reward/mean": 0.2958984375, "rewards/tag_count_reward/std": 0.21600830554962158, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.09831868225655031, "frac_reward_zero_std": 0.0, "grad_norm": 1.61859231865027, "kl": 0.9521484375, "learning_rate": 1.9624573378839593e-05, "loss": 0.0381, "num_tokens": 156555520.0, "reward": 0.56640625, "reward_std": 0.4545474052429199, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.24609375, "rewards/format_reward/std": 0.43157756328582764, "rewards/tag_count_reward/mean": 0.31640625, "rewards/tag_count_reward/std": 0.22466616332530975, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.0984893744132457, "frac_reward_zero_std": 0.0, "grad_norm": 1.1402277415971542, "kl": 0.806640625, "learning_rate": 1.9658703071672357e-05, "loss": 0.0322, "num_tokens": 157122176.0, "reward": 0.7255859375, "reward_std": 0.49441808462142944, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.49209436774253845, "rewards/tag_count_reward/mean": 0.2958984375, "rewards/tag_count_reward/std": 0.1994914561510086, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.09866006656994111, "frac_reward_zero_std": 0.0, "grad_norm": 3.4635186992466673, "kl": 1.126953125, "learning_rate": 1.969283276450512e-05, "loss": 0.0451, "num_tokens": 157687440.0, "reward": 1.0078125, "reward_std": 0.46499696373939514, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.69921875, "rewards/format_reward/std": 0.45949608087539673, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.1714985966682434, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2015.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1788.0, "completions/min_terminated_length": 0.0, "epoch": 0.0988307587266365, "frac_reward_zero_std": 0.0, "grad_norm": 4.081225884121053, "kl": 1.236328125, "learning_rate": 1.9726962457337885e-05, "loss": 0.0494, "num_tokens": 158244752.0, "reward": 1.0048828125, "reward_std": 0.44527745246887207, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.68359375, "rewards/format_reward/std": 0.4659844934940338, "rewards/tag_count_reward/mean": 0.2783203125, "rewards/tag_count_reward/std": 0.19027511775493622, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.09900145088333191, "frac_reward_zero_std": 0.0, "grad_norm": 4.724500448781497, "kl": 1.525390625, "learning_rate": 1.9761092150170652e-05, "loss": 0.0611, "num_tokens": 158809088.0, "reward": 1.0107421875, "reward_std": 0.3638307452201843, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42443734407424927, "rewards/tag_count_reward/mean": 0.2451171875, "rewards/tag_count_reward/std": 0.1442546397447586, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1996.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 1737.0, "completions/min_terminated_length": 0.0, "epoch": 0.0991721430400273, "frac_reward_zero_std": 0.0, "grad_norm": 3.6414465451747446, "kl": 1.6171875, "learning_rate": 1.9795221843003412e-05, "loss": 0.0646, "num_tokens": 159359616.0, "reward": 1.1796875, "reward_std": 0.3069320321083069, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.91796875, "rewards/format_reward/std": 0.2749498784542084, "rewards/tag_count_reward/mean": 0.234375, "rewards/tag_count_reward/std": 0.13558153808116913, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2001.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 1859.0, "completions/min_terminated_length": 0.0, "epoch": 0.09934283519672271, "frac_reward_zero_std": 0.0, "grad_norm": 2.485685132151045, "kl": 1.6484375, "learning_rate": 1.982935153583618e-05, "loss": 0.0659, "num_tokens": 159915840.0, "reward": 1.16796875, "reward_std": 0.30576902627944946, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.92578125, "rewards/format_reward/std": 0.2626400291919708, "rewards/tag_count_reward/mean": 0.2265625, "rewards/tag_count_reward/std": 0.12302359193563461, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1777.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1408.0, "completions/min_terminated_length": 0.0, "epoch": 0.0995135273534181, "frac_reward_zero_std": 0.0, "grad_norm": 3.6097101378321503, "kl": 1.76171875, "learning_rate": 1.9863481228668943e-05, "loss": 0.0704, "num_tokens": 160410272.0, "reward": 1.138671875, "reward_std": 0.40498948097229004, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.84765625, "rewards/format_reward/std": 0.3600577116012573, "rewards/tag_count_reward/mean": 0.244140625, "rewards/tag_count_reward/std": 0.1190849244594574, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1890.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1451.0, "completions/min_terminated_length": 0.0, "epoch": 0.09968421951011351, "frac_reward_zero_std": 0.125, "grad_norm": 3.980373373601238, "kl": 1.712890625, "learning_rate": 1.989761092150171e-05, "loss": 0.0687, "num_tokens": 160936000.0, "reward": 1.1171875, "reward_std": 0.29475343227386475, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3483152687549591, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.10618149489164352, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1998.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1652.0, "completions/min_terminated_length": 0.0, "epoch": 0.0998549116668089, "frac_reward_zero_std": 0.0625, "grad_norm": 1.193118749834382, "kl": 1.455078125, "learning_rate": 1.993174061433447e-05, "loss": 0.0581, "num_tokens": 161485312.0, "reward": 1.1806640625, "reward_std": 0.31180715560913086, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.31272050738334656, "rewards/tag_count_reward/mean": 0.2509765625, "rewards/tag_count_reward/std": 0.09522431343793869, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1779.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1412.0, "completions/min_terminated_length": 0.0, "epoch": 0.10002560382350431, "frac_reward_zero_std": 0.0, "grad_norm": 1.5451894020223713, "kl": 1.732421875, "learning_rate": 1.9965870307167238e-05, "loss": 0.0693, "num_tokens": 161984768.0, "reward": 1.216796875, "reward_std": 0.27393966913223267, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.9296875, "rewards/format_reward/std": 0.2561737895011902, "rewards/tag_count_reward/mean": 0.259765625, "rewards/tag_count_reward/std": 0.09342263638973236, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1855.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 1369.0, "completions/min_terminated_length": 0.0, "epoch": 0.1001962959801997, "frac_reward_zero_std": 0.375, "grad_norm": 3.3647179657647386, "kl": 1.6015625, "learning_rate": 2e-05, "loss": 0.064, "num_tokens": 162495568.0, "reward": 1.26953125, "reward_std": 0.1950579285621643, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03814799711108208, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1706.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1353.0, "completions/min_terminated_length": 0.0, "epoch": 0.10036698813689511, "frac_reward_zero_std": 0.1875, "grad_norm": 6.724932418370825, "kl": 1.892578125, "learning_rate": 1.9999998225180493e-05, "loss": 0.0757, "num_tokens": 162979472.0, "reward": 1.2548828125, "reward_std": 0.11705857515335083, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.2666015625, "rewards/tag_count_reward/std": 0.08838293701410294, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1462.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 1097.0, "completions/min_terminated_length": 0.0, "epoch": 0.1005376802935905, "frac_reward_zero_std": 0.0, "grad_norm": 6.274035991783945, "kl": 1.89453125, "learning_rate": 1.999999290072259e-05, "loss": 0.0758, "num_tokens": 163387408.0, "reward": 1.2724609375, "reward_std": 0.22743982076644897, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.20318391919136047, "rewards/tag_count_reward/mean": 0.3154296875, "rewards/tag_count_reward/std": 0.14302821457386017, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1599.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 1069.0, "completions/min_terminated_length": 0.0, "epoch": 0.10070837245028591, "frac_reward_zero_std": 0.0, "grad_norm": 9.9393731230103, "kl": 2.2734375, "learning_rate": 1.999998402662819e-05, "loss": 0.0911, "num_tokens": 163842784.0, "reward": 1.3525390625, "reward_std": 0.31006208062171936, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.93359375, "rewards/format_reward/std": 0.24947863817214966, "rewards/tag_count_reward/mean": 0.4072265625, "rewards/tag_count_reward/std": 0.15635719895362854, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1860.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1364.0, "completions/min_terminated_length": 0.0, "epoch": 0.1008790646069813, "frac_reward_zero_std": 0.0, "grad_norm": 2.447968629985816, "kl": 1.8046875, "learning_rate": 1.9999971602900436e-05, "loss": 0.0721, "num_tokens": 164357136.0, "reward": 1.21484375, "reward_std": 0.4969516396522522, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.7734375, "rewards/format_reward/std": 0.41942715644836426, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.13826657831668854, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1826.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1233.0, "completions/min_terminated_length": 0.0, "epoch": 0.10104975676367671, "frac_reward_zero_std": 0.0, "grad_norm": 0.48526299869522616, "kl": 1.890625, "learning_rate": 1.999995562954374e-05, "loss": 0.0757, "num_tokens": 164863984.0, "reward": 1.34765625, "reward_std": 0.5060468316078186, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 0.8359375, "rewards/format_reward/std": 0.3710577189922333, "rewards/tag_count_reward/mean": 0.4453125, "rewards/tag_count_reward/std": 0.1288619190454483, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1920.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1547.0, "completions/min_terminated_length": 0.0, "epoch": 0.1012204489203721, "frac_reward_zero_std": 0.0, "grad_norm": 8.62619978748104, "kl": 1.609375, "learning_rate": 1.999993610656378e-05, "loss": 0.0644, "num_tokens": 165396128.0, "reward": 1.3935546875, "reward_std": 0.2730828821659088, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.20318391919136047, "rewards/tag_count_reward/mean": 0.4052734375, "rewards/tag_count_reward/std": 0.1643809676170349, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2043.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 2009.0, "completions/min_terminated_length": 0.0, "epoch": 0.10139114107706751, "frac_reward_zero_std": 0.0, "grad_norm": 7.082224867668581, "kl": 1.486328125, "learning_rate": 1.999991303396747e-05, "loss": 0.0595, "num_tokens": 165962224.0, "reward": 1.37109375, "reward_std": 0.24835151433944702, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.1579200029373169, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1998.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 1838.0, "completions/min_terminated_length": 0.0, "epoch": 0.1015618332337629, "frac_reward_zero_std": 0.0, "grad_norm": 8.999654134643597, "kl": 1.58984375, "learning_rate": 1.9999886411763017e-05, "loss": 0.0636, "num_tokens": 166513392.0, "reward": 1.40234375, "reward_std": 0.26082542538642883, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.1783296763896942, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1956.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1786.0, "completions/min_terminated_length": 0.0, "epoch": 0.10173252539045831, "frac_reward_zero_std": 0.0, "grad_norm": 4.7708744717407345, "kl": 1.615234375, "learning_rate": 1.999985623995986e-05, "loss": 0.0646, "num_tokens": 167053840.0, "reward": 1.451171875, "reward_std": 0.3468436002731323, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.396484375, "rewards/tag_count_reward/std": 0.20775963366031647, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1965.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1689.0, "completions/min_terminated_length": 0.0, "epoch": 0.1019032175471537, "frac_reward_zero_std": 0.0, "grad_norm": 6.36907348469765, "kl": 1.654296875, "learning_rate": 1.9999822518568713e-05, "loss": 0.0661, "num_tokens": 167603392.0, "reward": 1.5107421875, "reward_std": 0.3024976849555969, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.4912109375, "rewards/tag_count_reward/std": 0.23983430862426758, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1602.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 1140.0, "completions/min_terminated_length": 0.0, "epoch": 0.10207390970384911, "frac_reward_zero_std": 0.0, "grad_norm": 5.568632870443633, "kl": 2.12890625, "learning_rate": 1.999978524760154e-05, "loss": 0.0852, "num_tokens": 168053312.0, "reward": 1.5966796875, "reward_std": 0.3265994191169739, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.6083984375, "rewards/tag_count_reward/std": 0.21629177033901215, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1552.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 976.0, "completions/min_terminated_length": 0.0, "epoch": 0.1022446018605445, "frac_reward_zero_std": 0.0, "grad_norm": 1.9582688469816405, "kl": 2.4609375, "learning_rate": 1.999974442707158e-05, "loss": 0.0982, "num_tokens": 168489344.0, "reward": 1.69140625, "reward_std": 0.3407438397407532, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.7109375, "rewards/tag_count_reward/std": 0.23626485466957092, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1341.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 1021.0, "completions/min_terminated_length": 0.0, "epoch": 0.10241529401723991, "frac_reward_zero_std": 0.0, "grad_norm": 1.9088582542941746, "kl": 2.87109375, "learning_rate": 1.999970005699332e-05, "loss": 0.1146, "num_tokens": 168880848.0, "reward": 1.8642578125, "reward_std": 0.26353919506073, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.8642578125, "rewards/tag_count_reward/std": 0.18755872547626495, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1235.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 941.0, "completions/min_terminated_length": 0.0, "epoch": 0.10258598617393531, "frac_reward_zero_std": 0.0, "grad_norm": 4.609999074966102, "kl": 2.9609375, "learning_rate": 1.9999652137382506e-05, "loss": 0.1185, "num_tokens": 169237808.0, "reward": 1.806640625, "reward_std": 0.4162787199020386, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.89453125, "rewards/format_reward/std": 0.3077581524848938, "rewards/tag_count_reward/mean": 0.900390625, "rewards/tag_count_reward/std": 0.19886897504329681, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1394.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1090.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 862.0, "completions/min_terminated_length": 0.0, "epoch": 0.10275667833063071, "frac_reward_zero_std": 0.125, "grad_norm": 70.54775307958599, "kl": 8.28125, "learning_rate": 1.9999600668256148e-05, "loss": 0.332, "num_tokens": 169556416.0, "reward": 1.814453125, "reward_std": 0.49247217178344727, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.86328125, "rewards/format_reward/std": 0.34422317147254944, "rewards/tag_count_reward/mean": 0.916015625, "rewards/tag_count_reward/std": 0.20478908717632294, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1264.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1058.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 844.0, "completions/min_terminated_length": 0.0, "epoch": 0.10292737048732611, "frac_reward_zero_std": 0.0, "grad_norm": 0.4122987467756874, "kl": 3.27734375, "learning_rate": 1.9999545649632522e-05, "loss": 0.131, "num_tokens": 169865472.0, "reward": 1.72265625, "reward_std": 0.6294344067573547, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 0.75390625, "rewards/format_reward/std": 0.43157756328582764, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.22673621773719788, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1184.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 906.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 720.0, "completions/min_terminated_length": 0.0, "epoch": 0.10309806264402151, "frac_reward_zero_std": 0.0, "grad_norm": 0.5903925224671511, "kl": 3.1953125, "learning_rate": 1.9999487081531148e-05, "loss": 0.1277, "num_tokens": 170142848.0, "reward": 1.8828125, "reward_std": 0.3756501376628876, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.90234375, "rewards/format_reward/std": 0.29743078351020813, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.08984941244125366, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1137.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 866.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 638.0, "completions/min_terminated_length": 0.0, "epoch": 0.10326875480071691, "frac_reward_zero_std": 0.1875, "grad_norm": 75.59917844949977, "kl": 6.9296875, "learning_rate": 1.9999424963972824e-05, "loss": 0.2774, "num_tokens": 170422656.0, "reward": 1.9453125, "reward_std": 0.24238507449626923, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21178513765335083, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.0990147590637207, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 906.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 649.0, "completions/min_terminated_length": 0.0, "epoch": 0.1034394469574123, "frac_reward_zero_std": 0.25, "grad_norm": 0.7609272416817837, "kl": 3.16796875, "learning_rate": 1.9999359296979595e-05, "loss": 0.1267, "num_tokens": 170695696.0, "reward": 1.9619140625, "reward_std": 0.1980436146259308, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.9775390625, "rewards/tag_count_reward/std": 0.07817400991916656, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 809.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 629.0, "completions/min_terminated_length": 0.0, "epoch": 0.10361013911410771, "frac_reward_zero_std": 0.3125, "grad_norm": 1.4007467755590917, "kl": 2.6328125, "learning_rate": 1.9999290080574774e-05, "loss": 0.1051, "num_tokens": 170945392.0, "reward": 1.98828125, "reward_std": 0.15075168013572693, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0842171311378479, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1037808312708031, "frac_reward_zero_std": 0.0, "grad_norm": 38.848248974740564, "kl": 1.685546875, "learning_rate": 1.999921731478293e-05, "loss": 0.0675, "num_tokens": 171508896.0, "reward": 1.7900390625, "reward_std": 0.22117257118225098, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.7509765625, "rewards/tag_count_reward/std": 0.15418675541877747, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.10395152342749851, "frac_reward_zero_std": 0.0625, "grad_norm": 1.964158567100819, "kl": 0.39794921875, "learning_rate": 1.9999140999629882e-05, "loss": 0.0159, "num_tokens": 172068400.0, "reward": 1.615234375, "reward_std": 0.303481787443161, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.91796875, "rewards/format_reward/std": 0.2749498784542084, "rewards/tag_count_reward/mean": 0.685546875, "rewards/tag_count_reward/std": 0.137363463640213, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1041222155841939, "frac_reward_zero_std": 0.0, "grad_norm": 0.3118259635193676, "kl": 0.3955078125, "learning_rate": 1.9999061135142734e-05, "loss": 0.0158, "num_tokens": 172636560.0, "reward": 1.6083984375, "reward_std": 0.31111860275268555, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2920515835285187, "rewards/tag_count_reward/mean": 0.7021484375, "rewards/tag_count_reward/std": 0.10574328154325485, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.10429290774088931, "frac_reward_zero_std": 0.125, "grad_norm": 0.32109065063823317, "kl": 0.39111328125, "learning_rate": 1.9998977721349826e-05, "loss": 0.0156, "num_tokens": 173203888.0, "reward": 1.701171875, "reward_std": 0.16034698486328125, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.720703125, "rewards/tag_count_reward/std": 0.08643876761198044, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1044635998975847, "frac_reward_zero_std": 0.1875, "grad_norm": 8.450373065881378, "kl": 1.23291015625, "learning_rate": 1.9998890758280773e-05, "loss": 0.0492, "num_tokens": 173779696.0, "reward": 1.7509765625, "reward_std": 0.19039931893348694, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.7236328125, "rewards/tag_count_reward/std": 0.08006177842617035, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.10463429205428011, "frac_reward_zero_std": 0.0625, "grad_norm": 6.938281247611859, "kl": 1.00537109375, "learning_rate": 1.9998800245966434e-05, "loss": 0.0402, "num_tokens": 174341888.0, "reward": 1.7373046875, "reward_std": 0.17111045122146606, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.21840041875839233, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.7099609375, "rewards/tag_count_reward/std": 0.09449763596057892, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 2041.19140625, "completions/mean_terminated_length": 305.0, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.1048049842109755, "frac_reward_zero_std": 0.0625, "grad_norm": 2.379328392769953, "kl": 0.4453125, "learning_rate": 1.9998706184438946e-05, "loss": 0.0048, "num_tokens": 174904417.0, "reward": 1.6787109375, "reward_std": 0.15066847205162048, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.6982421875, "rewards/tag_count_reward/std": 0.10387981683015823, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.10497567636767091, "frac_reward_zero_std": 0.0, "grad_norm": 1.2777841376112542, "kl": 0.37255859375, "learning_rate": 1.9998608573731696e-05, "loss": 0.0149, "num_tokens": 175471377.0, "reward": 1.6845703125, "reward_std": 0.2762886881828308, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.93359375, "rewards/format_reward/std": 0.24947863817214966, "rewards/tag_count_reward/mean": 0.7158203125, "rewards/tag_count_reward/std": 0.08605579286813736, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1051463685243663, "frac_reward_zero_std": 0.125, "grad_norm": 1.9634427933724818, "kl": 0.484375, "learning_rate": 1.9998507413879333e-05, "loss": 0.0194, "num_tokens": 176038593.0, "reward": 1.71484375, "reward_std": 0.22562292218208313, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.08284168690443039, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.10531706068106171, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6319661123814697, "kl": 0.78125, "learning_rate": 1.999840270491776e-05, "loss": 0.0312, "num_tokens": 176609409.0, "reward": 1.6640625, "reward_std": 0.24377846717834473, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24253563582897186, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.0857492983341217, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1054877528377571, "frac_reward_zero_std": 0.3125, "grad_norm": 0.18311182951541943, "kl": 0.9345703125, "learning_rate": 1.9998294446884152e-05, "loss": 0.0374, "num_tokens": 177169553.0, "reward": 1.7138671875, "reward_std": 0.1640871912240982, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.7333984375, "rewards/tag_count_reward/std": 0.062369659543037415, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.10565844499445251, "frac_reward_zero_std": 0.0625, "grad_norm": 0.20947723237368632, "kl": 1.412109375, "learning_rate": 1.999818263981693e-05, "loss": 0.0565, "num_tokens": 177734849.0, "reward": 1.7490234375, "reward_std": 0.22855833172798157, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.7255859375, "rewards/tag_count_reward/std": 0.0775839164853096, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1058291371511479, "frac_reward_zero_std": 0.125, "grad_norm": 0.2292089197936045, "kl": 2.2734375, "learning_rate": 1.9998067283755787e-05, "loss": 0.0908, "num_tokens": 178296081.0, "reward": 1.78515625, "reward_std": 0.1981242597103119, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2561737895011902, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.07739239931106567, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.10599982930784331, "frac_reward_zero_std": 0.25, "grad_norm": 0.2924409729584206, "kl": 2.50390625, "learning_rate": 1.9997948378741663e-05, "loss": 0.1001, "num_tokens": 178864881.0, "reward": 1.71484375, "reward_std": 0.17618334293365479, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.08247103542089462, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1061705214645387, "frac_reward_zero_std": 0.1875, "grad_norm": 0.47518757845682025, "kl": 2.55078125, "learning_rate": 1.9997825924816774e-05, "loss": 0.1021, "num_tokens": 179429457.0, "reward": 1.7177734375, "reward_std": 0.15613043308258057, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.05928463488817215, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.10634121362123411, "frac_reward_zero_std": 0.25, "grad_norm": 0.35975905459091745, "kl": 2.169921875, "learning_rate": 1.9997699922024583e-05, "loss": 0.0867, "num_tokens": 179993313.0, "reward": 1.71484375, "reward_std": 0.16066521406173706, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.07077564299106598, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1065119057779295, "frac_reward_zero_std": 0.3125, "grad_norm": 0.2977060580519479, "kl": 1.791015625, "learning_rate": 1.9997570370409813e-05, "loss": 0.0716, "num_tokens": 180558209.0, "reward": 1.7177734375, "reward_std": 0.13118445873260498, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.7333984375, "rewards/tag_count_reward/std": 0.06978800892829895, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.10668259793462491, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8972267837428408, "kl": 1.55859375, "learning_rate": 1.9997437270018454e-05, "loss": 0.0624, "num_tokens": 181121169.0, "reward": 1.6787109375, "reward_std": 0.20922282338142395, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.6630859375, "rewards/tag_count_reward/std": 0.16122889518737793, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1068532900913203, "frac_reward_zero_std": 0.0, "grad_norm": 0.48820972280049146, "kl": 1.80078125, "learning_rate": 1.999730062089775e-05, "loss": 0.0721, "num_tokens": 181684561.0, "reward": 1.62109375, "reward_std": 0.43702858686447144, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3638034462928772, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.09450270235538483, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.10702398224801571, "frac_reward_zero_std": 0.0, "grad_norm": 0.7340457460874638, "kl": 2.203125, "learning_rate": 1.9997160423096213e-05, "loss": 0.0881, "num_tokens": 182254465.0, "reward": 1.5087890625, "reward_std": 0.4994441270828247, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.7421875, "rewards/format_reward/std": 0.4382871091365814, "rewards/tag_count_reward/mean": 0.7470703125, "rewards/tag_count_reward/std": 0.1443077176809311, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1328.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1082.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 801.0, "completions/min_terminated_length": 0.0, "epoch": 0.1071946744047111, "frac_reward_zero_std": 0.0, "grad_norm": 1.8134561319244862, "kl": 3.31640625, "learning_rate": 1.9997016676663595e-05, "loss": 0.1326, "num_tokens": 182570337.0, "reward": 1.6103515625, "reward_std": 0.6418291926383972, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 0.66796875, "rewards/format_reward/std": 0.4718646705150604, "rewards/tag_count_reward/mean": 0.8916015625, "rewards/tag_count_reward/std": 0.17069000005722046, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2045.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 2027.0, "completions/min_terminated_length": 0.0, "epoch": 0.10736536656140651, "frac_reward_zero_std": 0.0, "grad_norm": 2.027247677806578, "kl": 2.8515625, "learning_rate": 1.9996869381650935e-05, "loss": 0.1139, "num_tokens": 183132945.0, "reward": 1.685546875, "reward_std": 0.5799033045768738, "rewards/accuracy_reward/mean": 0.01666666753590107, "rewards/accuracy_reward/std": 0.12828664481639862, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.3910769522190094, "rewards/tag_count_reward/mean": 0.857421875, "rewards/tag_count_reward/std": 0.21278636157512665, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1951.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1273.0, "completions/min_terminated_length": 0.0, "epoch": 0.1075360587181019, "frac_reward_zero_std": 0.0, "grad_norm": 0.4493988064788792, "kl": 2.8984375, "learning_rate": 1.9996718538110508e-05, "loss": 0.1159, "num_tokens": 183678961.0, "reward": 0.4296875, "reward_std": 0.42115336656570435, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.24253563582897186, "rewards/tag_count_reward/mean": 0.33984375, "rewards/tag_count_reward/std": 0.2185836285352707, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.10770675087479731, "frac_reward_zero_std": 0.0, "grad_norm": 1.9526840830536671, "kl": 2.390625, "learning_rate": 1.999656414609586e-05, "loss": 0.0956, "num_tokens": 184246785.0, "reward": 1.390625, "reward_std": 0.7740119695663452, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29743078351020813, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5009794235229492, "rewards/tag_count_reward/mean": 0.79296875, "rewards/tag_count_reward/std": 0.24476754665374756, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1078774430314927, "frac_reward_zero_std": 0.0, "grad_norm": 0.6154208214482098, "kl": 2.62890625, "learning_rate": 1.9996406205661792e-05, "loss": 0.1052, "num_tokens": 184809217.0, "reward": 1.3759765625, "reward_std": 0.7340596914291382, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.5390625, "rewards/format_reward/std": 0.4994482398033142, "rewards/tag_count_reward/mean": 0.8017578125, "rewards/tag_count_reward/std": 0.2514796257019043, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1995.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1626.0, "completions/min_terminated_length": 0.0, "epoch": 0.10804813518818811, "frac_reward_zero_std": 0.0, "grad_norm": 0.2506743732895677, "kl": 3.00390625, "learning_rate": 1.9996244716864373e-05, "loss": 0.1201, "num_tokens": 185361153.0, "reward": 1.1806640625, "reward_std": 0.6507062911987305, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.359375, "rewards/format_reward/std": 0.4807571768760681, "rewards/tag_count_reward/mean": 0.7822265625, "rewards/tag_count_reward/std": 0.18835335969924927, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1926.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1313.0, "completions/min_terminated_length": 0.0, "epoch": 0.1082188273448835, "frac_reward_zero_std": 0.0, "grad_norm": 0.648525423095735, "kl": 3.2578125, "learning_rate": 1.9996079679760927e-05, "loss": 0.1303, "num_tokens": 185894049.0, "reward": 1.67578125, "reward_std": 0.5592831373214722, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.73828125, "rewards/format_reward/std": 0.4404313564300537, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.1746407449245453, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.10838951950157891, "frac_reward_zero_std": 0.0, "grad_norm": 0.2870474394646332, "kl": 0.4716796875, "learning_rate": 1.999591109441003e-05, "loss": 0.0189, "num_tokens": 186454961.0, "reward": 0.7275390625, "reward_std": 0.44705766439437866, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.3562295734882355, "rewards/format_reward/mean": 0.12890625, "rewards/format_reward/std": 0.33575257658958435, "rewards/tag_count_reward/mean": 0.4501953125, "rewards/tag_count_reward/std": 0.19099831581115723, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1085602116582743, "frac_reward_zero_std": 0.0, "grad_norm": 0.29043645705213217, "kl": 0.4814453125, "learning_rate": 1.9995738960871525e-05, "loss": 0.0193, "num_tokens": 187019505.0, "reward": 0.4404296875, "reward_std": 0.27341896295547485, "rewards/accuracy_reward/mean": 0.012500000186264515, "rewards/accuracy_reward/std": 0.11133462190628052, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.24253563582897186, "rewards/tag_count_reward/mean": 0.3662109375, "rewards/tag_count_reward/std": 0.15473221242427826, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1087309038149697, "frac_reward_zero_std": 0.0, "grad_norm": 0.5925478189116727, "kl": 0.5703125, "learning_rate": 1.999556327920652e-05, "loss": 0.0228, "num_tokens": 187582337.0, "reward": 0.849609375, "reward_std": 0.45706188678741455, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.55078125, "rewards/format_reward/std": 0.49838894605636597, "rewards/tag_count_reward/mean": 0.298828125, "rewards/tag_count_reward/std": 0.10872971266508102, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1089015959716651, "frac_reward_zero_std": 0.0, "grad_norm": 0.25293861443013643, "kl": 0.48388671875, "learning_rate": 1.999538404947736e-05, "loss": 0.0193, "num_tokens": 188144625.0, "reward": 1.1142578125, "reward_std": 0.54804527759552, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 0.6328125, "rewards/format_reward/std": 0.48298248648643494, "rewards/tag_count_reward/mean": 0.4150390625, "rewards/tag_count_reward/std": 0.15922114253044128, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1090722881283605, "frac_reward_zero_std": 0.0, "grad_norm": 0.24719856314581581, "kl": 0.38037109375, "learning_rate": 1.9995201271747682e-05, "loss": 0.0152, "num_tokens": 188711617.0, "reward": 1.486328125, "reward_std": 0.3858865201473236, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.8671875, "rewards/format_reward/std": 0.3400367796421051, "rewards/tag_count_reward/mean": 0.595703125, "rewards/tag_count_reward/std": 0.15693478286266327, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1092429802850559, "frac_reward_zero_std": 0.0, "grad_norm": 315.4074720228172, "kl": 2.212890625, "learning_rate": 1.9995014946082357e-05, "loss": 0.0885, "num_tokens": 189281249.0, "reward": 1.3876953125, "reward_std": 0.5086683630943298, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.68359375, "rewards/format_reward/std": 0.4659844934940338, "rewards/tag_count_reward/mean": 0.6845703125, "rewards/tag_count_reward/std": 0.20500171184539795, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1094136724417513, "frac_reward_zero_std": 0.0, "grad_norm": 0.14103479612377354, "kl": 0.192626953125, "learning_rate": 1.9994825072547527e-05, "loss": 0.0077, "num_tokens": 189844593.0, "reward": 1.375, "reward_std": 0.5316616296768188, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.62890625, "rewards/format_reward/std": 0.48404383659362793, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.1819017231464386, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1095843645984467, "frac_reward_zero_std": 0.0, "grad_norm": 0.1322583590091962, "kl": 0.17822265625, "learning_rate": 1.9994631651210586e-05, "loss": 0.0071, "num_tokens": 190410497.0, "reward": 1.5673828125, "reward_std": 0.4864317774772644, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 0.80078125, "rewards/format_reward/std": 0.40019527077674866, "rewards/tag_count_reward/mean": 0.6806640625, "rewards/tag_count_reward/std": 0.19760118424892426, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1097550567551421, "frac_reward_zero_std": 0.0, "grad_norm": 0.14401352224921488, "kl": 0.20751953125, "learning_rate": 1.9994434682140196e-05, "loss": 0.0083, "num_tokens": 190973185.0, "reward": 1.646484375, "reward_std": 0.25294920802116394, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.626953125, "rewards/tag_count_reward/std": 0.15177412331104279, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1099257489118375, "frac_reward_zero_std": 0.0, "grad_norm": 0.33836744904841276, "kl": 0.32421875, "learning_rate": 1.9994234165406272e-05, "loss": 0.013, "num_tokens": 191536241.0, "reward": 1.6025390625, "reward_std": 0.24454611539840698, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.6181640625, "rewards/tag_count_reward/std": 0.1467028111219406, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1100964410685329, "frac_reward_zero_std": 0.0, "grad_norm": 0.1574801525059125, "kl": 0.2021484375, "learning_rate": 1.999403010107999e-05, "loss": 0.0081, "num_tokens": 192099473.0, "reward": 1.6865234375, "reward_std": 0.3496113717556, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2749498784542084, "rewards/format_reward/mean": 0.92578125, "rewards/format_reward/std": 0.2626400291919708, "rewards/tag_count_reward/mean": 0.6787109375, "rewards/tag_count_reward/std": 0.14364273846149445, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1102671332252283, "frac_reward_zero_std": 0.0, "grad_norm": 0.17714932048704526, "kl": 0.255615234375, "learning_rate": 1.9993822489233784e-05, "loss": 0.0102, "num_tokens": 192669041.0, "reward": 1.3857421875, "reward_std": 0.39765045046806335, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2920515835285187, "rewards/tag_count_reward/mean": 0.4599609375, "rewards/tag_count_reward/std": 0.2386818379163742, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1104378253819237, "frac_reward_zero_std": 0.0, "grad_norm": 0.40417976889949375, "kl": 0.35302734375, "learning_rate": 1.9993611329941354e-05, "loss": 0.0141, "num_tokens": 193242385.0, "reward": 1.658203125, "reward_std": 0.1896948665380478, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.677734375, "rewards/tag_count_reward/std": 0.14229334890842438, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1106085175386191, "frac_reward_zero_std": 0.0, "grad_norm": 0.27263911149547637, "kl": 0.3359375, "learning_rate": 1.9993396623277647e-05, "loss": 0.0134, "num_tokens": 193813697.0, "reward": 1.6982421875, "reward_std": 0.24212154746055603, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.6943359375, "rewards/tag_count_reward/std": 0.1195463240146637, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1107792096953145, "frac_reward_zero_std": 0.0, "grad_norm": 0.2521364656846532, "kl": 0.4306640625, "learning_rate": 1.9993178369318884e-05, "loss": 0.0172, "num_tokens": 194376385.0, "reward": 1.712890625, "reward_std": 0.26544004678726196, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.693359375, "rewards/tag_count_reward/std": 0.10485678911209106, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1109499018520099, "frac_reward_zero_std": 0.0, "grad_norm": 0.2647429288924342, "kl": 0.47802734375, "learning_rate": 1.999295656814253e-05, "loss": 0.0191, "num_tokens": 194936881.0, "reward": 1.6865234375, "reward_std": 0.17083899676799774, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.6865234375, "rewards/tag_count_reward/std": 0.11124969273805618, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1111205940087053, "frac_reward_zero_std": 0.0, "grad_norm": 0.24259102417751663, "kl": 0.4921875, "learning_rate": 1.999273121982732e-05, "loss": 0.0197, "num_tokens": 195501121.0, "reward": 1.7080078125, "reward_std": 0.2322225719690323, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.6728515625, "rewards/tag_count_reward/std": 0.1238882839679718, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1112912861654007, "frac_reward_zero_std": 0.0, "grad_norm": 0.24816986272447009, "kl": 0.55078125, "learning_rate": 1.9992502324453244e-05, "loss": 0.022, "num_tokens": 196065489.0, "reward": 1.6728515625, "reward_std": 0.1668979972600937, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.6962890625, "rewards/tag_count_reward/std": 0.1028796136379242, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1114619783220961, "frac_reward_zero_std": 0.0625, "grad_norm": 0.25272300693342986, "kl": 0.529296875, "learning_rate": 1.999226988210155e-05, "loss": 0.0212, "num_tokens": 196627249.0, "reward": 1.7490234375, "reward_std": 0.20116811990737915, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.7177734375, "rewards/tag_count_reward/std": 0.08680903166532516, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1116326704787915, "frac_reward_zero_std": 0.0, "grad_norm": 0.23037399633830902, "kl": 0.49609375, "learning_rate": 1.999203389285475e-05, "loss": 0.0199, "num_tokens": 197194081.0, "reward": 1.765625, "reward_std": 0.3169826865196228, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 0.94140625, "rewards/format_reward/std": 0.23532284796237946, "rewards/tag_count_reward/mean": 0.7109375, "rewards/tag_count_reward/std": 0.10589256882667542, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1118033626354869, "frac_reward_zero_std": 0.1875, "grad_norm": 0.19874251012548014, "kl": 0.46875, "learning_rate": 1.999179435679661e-05, "loss": 0.0188, "num_tokens": 197757921.0, "reward": 1.708984375, "reward_std": 0.27937841415405273, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24253563582897186, "rewards/tag_count_reward/mean": 0.708984375, "rewards/tag_count_reward/std": 0.1119232252240181, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1119740547921823, "frac_reward_zero_std": 0.125, "grad_norm": 0.2035837761790215, "kl": 0.55078125, "learning_rate": 1.9991551274012156e-05, "loss": 0.022, "num_tokens": 198323441.0, "reward": 1.7333984375, "reward_std": 0.31652510166168213, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31272050738334656, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.28082075715065, "rewards/tag_count_reward/mean": 0.7099609375, "rewards/tag_count_reward/std": 0.11118081957101822, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1121447469488777, "frac_reward_zero_std": 0.0625, "grad_norm": 0.19923189837276828, "kl": 0.396484375, "learning_rate": 1.9991304644587672e-05, "loss": 0.0158, "num_tokens": 198887857.0, "reward": 1.6884765625, "reward_std": 0.30069756507873535, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.92578125, "rewards/format_reward/std": 0.2626400291919708, "rewards/tag_count_reward/mean": 0.7236328125, "rewards/tag_count_reward/std": 0.08596674352884293, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1123154391055731, "frac_reward_zero_std": 0.0, "grad_norm": 0.23098586043721134, "kl": 0.37744140625, "learning_rate": 1.9991054468610705e-05, "loss": 0.0151, "num_tokens": 199449873.0, "reward": 1.5693359375, "reward_std": 0.44297897815704346, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.8046875, "rewards/format_reward/std": 0.39721766114234924, "rewards/tag_count_reward/mean": 0.7333984375, "rewards/tag_count_reward/std": 0.07649025321006775, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1124861312622685, "frac_reward_zero_std": 0.0625, "grad_norm": 0.20956252780016352, "kl": 0.30908203125, "learning_rate": 1.999080074617006e-05, "loss": 0.0123, "num_tokens": 200018113.0, "reward": 1.646484375, "reward_std": 0.35282978415489197, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.88671875, "rewards/format_reward/std": 0.31755712628364563, "rewards/tag_count_reward/mean": 0.724609375, "rewards/tag_count_reward/std": 0.08767052739858627, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1126568234189639, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2030262432918514, "kl": 0.3427734375, "learning_rate": 1.9990543477355797e-05, "loss": 0.0137, "num_tokens": 200580977.0, "reward": 1.6796875, "reward_std": 0.2452220618724823, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.94140625, "rewards/format_reward/std": 0.23532284796237946, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.08247103542089462, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1128275155756593, "frac_reward_zero_std": 0.125, "grad_norm": 0.19611013457701415, "kl": 0.36865234375, "learning_rate": 1.9990282662259237e-05, "loss": 0.0147, "num_tokens": 201146017.0, "reward": 1.7744140625, "reward_std": 0.1819552481174469, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.7275390625, "rewards/tag_count_reward/std": 0.08707331866025925, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1129982077323547, "frac_reward_zero_std": 0.1875, "grad_norm": 0.20606374925883972, "kl": 0.37353515625, "learning_rate": 1.999001830097296e-05, "loss": 0.0149, "num_tokens": 201717825.0, "reward": 1.6865234375, "reward_std": 0.23159490525722504, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21178513765335083, "rewards/tag_count_reward/mean": 0.7255859375, "rewards/tag_count_reward/std": 0.0806812196969986, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1131688998890501, "frac_reward_zero_std": 0.125, "grad_norm": 0.2529969150703811, "kl": 0.39990234375, "learning_rate": 1.998975039359081e-05, "loss": 0.016, "num_tokens": 202284321.0, "reward": 1.7607421875, "reward_std": 0.2403927594423294, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2561737895011902, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.7255859375, "rewards/tag_count_reward/std": 0.08654393255710602, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1133395920457455, "frac_reward_zero_std": 0.125, "grad_norm": 3.6210521436724346, "kl": 0.601806640625, "learning_rate": 1.998947894020787e-05, "loss": 0.0241, "num_tokens": 202851377.0, "reward": 1.724609375, "reward_std": 0.12147148698568344, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.724609375, "rewards/tag_count_reward/std": 0.08188851922750473, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1135102842024409, "frac_reward_zero_std": 0.0, "grad_norm": 0.6287809538533373, "kl": 1.193359375, "learning_rate": 1.998920394092051e-05, "loss": 0.0478, "num_tokens": 203416625.0, "reward": 1.74609375, "reward_std": 0.22634106874465942, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2561737895011902, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.69921875, "rewards/tag_count_reward/std": 0.110077403485775, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1136809763591363, "frac_reward_zero_std": 0.0, "grad_norm": 0.14451911045245236, "kl": 0.326171875, "learning_rate": 1.9988925395826343e-05, "loss": 0.013, "num_tokens": 203983649.0, "reward": 1.73828125, "reward_std": 0.2597353160381317, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2561737895011902, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.69140625, "rewards/tag_count_reward/std": 0.13642629981040955, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1138516685158317, "frac_reward_zero_std": 0.0625, "grad_norm": 0.14212027177240724, "kl": 0.29736328125, "learning_rate": 1.9988643305024236e-05, "loss": 0.0119, "num_tokens": 204549297.0, "reward": 1.640625, "reward_std": 0.26320427656173706, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.20318391919136047, "rewards/tag_count_reward/mean": 0.66796875, "rewards/tag_count_reward/std": 0.17323161661624908, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11402236067252709, "frac_reward_zero_std": 0.0625, "grad_norm": 0.1980609994054226, "kl": 0.556640625, "learning_rate": 1.998835766861433e-05, "loss": 0.0223, "num_tokens": 205115089.0, "reward": 1.66796875, "reward_std": 0.31200993061065674, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24253563582897186, "rewards/tag_count_reward/mean": 0.6796875, "rewards/tag_count_reward/std": 0.1579200029373169, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2046.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 2035.0, "completions/min_terminated_length": 0.0, "epoch": 0.1141930528292225, "frac_reward_zero_std": 0.0, "grad_norm": 0.8772529161330743, "kl": 1.505859375, "learning_rate": 1.9988068486698008e-05, "loss": 0.0602, "num_tokens": 205676161.0, "reward": 1.669921875, "reward_std": 0.27229952812194824, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.94921875, "rewards/format_reward/std": 0.21998079121112823, "rewards/tag_count_reward/mean": 0.685546875, "rewards/tag_count_reward/std": 0.1525794416666031, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11436374498591789, "frac_reward_zero_std": 0.0, "grad_norm": 0.2739830625929278, "kl": 0.869140625, "learning_rate": 1.9987775759377923e-05, "loss": 0.0347, "num_tokens": 206245889.0, "reward": 1.7646484375, "reward_std": 0.3149721920490265, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3077581524848938, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.6982421875, "rewards/tag_count_reward/std": 0.12721310555934906, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1145344371426133, "frac_reward_zero_std": 0.0, "grad_norm": 0.16032561381377078, "kl": 0.9033203125, "learning_rate": 1.998747948675798e-05, "loss": 0.0361, "num_tokens": 206808241.0, "reward": 1.6826171875, "reward_std": 0.31629860401153564, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21178513765335083, "rewards/tag_count_reward/mean": 0.6396484375, "rewards/tag_count_reward/std": 0.18063144385814667, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11470512929930869, "frac_reward_zero_std": 0.0, "grad_norm": 0.2107712150483989, "kl": 1.318359375, "learning_rate": 1.9987179668943345e-05, "loss": 0.0527, "num_tokens": 207373969.0, "reward": 1.603515625, "reward_std": 0.3728443384170532, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 0.92578125, "rewards/format_reward/std": 0.2626400291919708, "rewards/tag_count_reward/mean": 0.611328125, "rewards/tag_count_reward/std": 0.20486389100551605, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1148758214560041, "frac_reward_zero_std": 0.0, "grad_norm": 0.2460724678350884, "kl": 1.18359375, "learning_rate": 1.9986876306040445e-05, "loss": 0.0473, "num_tokens": 207940673.0, "reward": 1.60546875, "reward_std": 0.36510246992111206, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.91015625, "rewards/format_reward/std": 0.2865179479122162, "rewards/tag_count_reward/mean": 0.6328125, "rewards/tag_count_reward/std": 0.1810576319694519, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11504651361269949, "frac_reward_zero_std": 0.0, "grad_norm": 0.2103020799624596, "kl": 0.810546875, "learning_rate": 1.9986569398156962e-05, "loss": 0.0324, "num_tokens": 208504897.0, "reward": 1.6513671875, "reward_std": 0.376197874546051, "rewards/accuracy_reward/mean": 0.07083333283662796, "rewards/accuracy_reward/std": 0.25708237290382385, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.28082075715065, "rewards/tag_count_reward/mean": 0.6708984375, "rewards/tag_count_reward/std": 0.1560630053281784, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1152172057693949, "frac_reward_zero_std": 0.0, "grad_norm": 0.3140993306779774, "kl": 0.7177734375, "learning_rate": 1.998625894540184e-05, "loss": 0.0287, "num_tokens": 209068049.0, "reward": 1.5869140625, "reward_std": 0.3344024121761322, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.87109375, "rewards/format_reward/std": 0.33575257658958435, "rewards/tag_count_reward/mean": 0.7001953125, "rewards/tag_count_reward/std": 0.1279934197664261, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11538789792609029, "frac_reward_zero_std": 0.0625, "grad_norm": 0.20705078154353318, "kl": 0.578125, "learning_rate": 1.998594494788527e-05, "loss": 0.0231, "num_tokens": 209644689.0, "reward": 1.697265625, "reward_std": 0.33826950192451477, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 0.91015625, "rewards/format_reward/std": 0.2865179479122162, "rewards/tag_count_reward/mean": 0.693359375, "rewards/tag_count_reward/std": 0.12991265952587128, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1155585900827857, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4014709394090809, "kl": 0.6162109375, "learning_rate": 1.998562740571872e-05, "loss": 0.0246, "num_tokens": 210208913.0, "reward": 1.630859375, "reward_std": 0.4073032736778259, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 0.87109375, "rewards/format_reward/std": 0.33575257658958435, "rewards/tag_count_reward/mean": 0.693359375, "rewards/tag_count_reward/std": 0.13363267481327057, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11572928223948109, "frac_reward_zero_std": 0.0, "grad_norm": 0.20473211432336427, "kl": 0.4462890625, "learning_rate": 1.9985306319014898e-05, "loss": 0.0178, "num_tokens": 210780433.0, "reward": 1.701171875, "reward_std": 0.3250214457511902, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.22781464457511902, "rewards/tag_count_reward/mean": 0.716796875, "rewards/tag_count_reward/std": 0.10323727875947952, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1158999743961765, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2248333951957876, "kl": 0.3857421875, "learning_rate": 1.9984981687887783e-05, "loss": 0.0154, "num_tokens": 211340929.0, "reward": 1.6572265625, "reward_std": 0.2646145224571228, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24253563582897186, "rewards/tag_count_reward/mean": 0.7197265625, "rewards/tag_count_reward/std": 0.09026933461427689, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11607066655287189, "frac_reward_zero_std": 0.1875, "grad_norm": 0.6486408534907112, "kl": 0.42578125, "learning_rate": 1.998465351245261e-05, "loss": 0.017, "num_tokens": 211906577.0, "reward": 1.7763671875, "reward_std": 0.20046323537826538, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.7333984375, "rewards/tag_count_reward/std": 0.062369659543037415, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1162413587095673, "frac_reward_zero_std": 0.1875, "grad_norm": 1.5658961504575795, "kl": 0.513671875, "learning_rate": 1.998432179282586e-05, "loss": 0.0205, "num_tokens": 212469905.0, "reward": 1.791015625, "reward_std": 0.2797257900238037, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.720703125, "rewards/tag_count_reward/std": 0.08922924101352692, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11641205086626269, "frac_reward_zero_std": 0.125, "grad_norm": 0.19102552488891988, "kl": 0.43896484375, "learning_rate": 1.998398652912529e-05, "loss": 0.0176, "num_tokens": 213040113.0, "reward": 1.767578125, "reward_std": 0.19338448345661163, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.728515625, "rewards/tag_count_reward/std": 0.07361361384391785, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1165827430229581, "frac_reward_zero_std": 0.0, "grad_norm": 0.20367954056437412, "kl": 0.43212890625, "learning_rate": 1.99836477214699e-05, "loss": 0.0173, "num_tokens": 213600081.0, "reward": 1.708984375, "reward_std": 0.2925114631652832, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.21840041875839233, "rewards/format_reward/mean": 0.94140625, "rewards/format_reward/std": 0.23532284796237946, "rewards/tag_count_reward/mean": 0.720703125, "rewards/tag_count_reward/std": 0.08056841045618057, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11675343517965349, "frac_reward_zero_std": 0.1875, "grad_norm": 0.1945952466848101, "kl": 0.43896484375, "learning_rate": 1.9983305369979962e-05, "loss": 0.0176, "num_tokens": 214169969.0, "reward": 1.658203125, "reward_std": 0.2811988890171051, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.93359375, "rewards/format_reward/std": 0.24947863817214966, "rewards/tag_count_reward/mean": 0.712890625, "rewards/tag_count_reward/std": 0.09945856034755707, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1169241273363489, "frac_reward_zero_std": 0.0, "grad_norm": 0.19430936261152512, "kl": 0.4306640625, "learning_rate": 1.9982959474776993e-05, "loss": 0.0172, "num_tokens": 214730833.0, "reward": 1.7412109375, "reward_std": 0.2579788565635681, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.7177734375, "rewards/tag_count_reward/std": 0.09490203112363815, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11709481949304429, "frac_reward_zero_std": 0.125, "grad_norm": 0.20938667435600083, "kl": 0.3798828125, "learning_rate": 1.998261003598377e-05, "loss": 0.0152, "num_tokens": 215292913.0, "reward": 1.6943359375, "reward_std": 0.20857754349708557, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.7216796875, "rewards/tag_count_reward/std": 0.0881660208106041, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1172655116497397, "frac_reward_zero_std": 0.125, "grad_norm": 0.18472271446476599, "kl": 0.453125, "learning_rate": 1.998225705372434e-05, "loss": 0.0181, "num_tokens": 215856081.0, "reward": 1.66796875, "reward_std": 0.20830203592777252, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.69921875, "rewards/tag_count_reward/std": 0.1266436129808426, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11743620380643509, "frac_reward_zero_std": 0.125, "grad_norm": 0.18823140285870987, "kl": 0.4697265625, "learning_rate": 1.9981900528123995e-05, "loss": 0.0188, "num_tokens": 216423185.0, "reward": 1.7138671875, "reward_std": 0.22368668019771576, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.7099609375, "rewards/tag_count_reward/std": 0.10435797274112701, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1176068959631305, "frac_reward_zero_std": 0.125, "grad_norm": 0.21290201053740365, "kl": 0.5947265625, "learning_rate": 1.998154045930929e-05, "loss": 0.0238, "num_tokens": 216992865.0, "reward": 1.875, "reward_std": 0.33159178495407104, "rewards/accuracy_reward/mean": 0.19140625, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.09650764614343643, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11777758811982589, "frac_reward_zero_std": 0.0, "grad_norm": 0.21338071480377085, "kl": 0.51904296875, "learning_rate": 1.998117684740803e-05, "loss": 0.0207, "num_tokens": 217556545.0, "reward": 1.658203125, "reward_std": 0.41227227449417114, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 0.87109375, "rewards/format_reward/std": 0.33575257658958435, "rewards/tag_count_reward/mean": 0.720703125, "rewards/tag_count_reward/std": 0.09712077677249908, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1179482802765213, "frac_reward_zero_std": 0.0, "grad_norm": 2.7087556207269525, "kl": 0.779296875, "learning_rate": 1.998080969254929e-05, "loss": 0.0311, "num_tokens": 218128929.0, "reward": 1.4365234375, "reward_std": 0.5210967063903809, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 0.6328125, "rewards/format_reward/std": 0.48298248648643494, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.10186959058046341, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11811897243321669, "frac_reward_zero_std": 0.0, "grad_norm": 0.22183364847298162, "kl": 0.953125, "learning_rate": 1.9980438994863402e-05, "loss": 0.0382, "num_tokens": 218693969.0, "reward": 1.2001953125, "reward_std": 0.5191805958747864, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.42578125, "rewards/format_reward/std": 0.49542948603630066, "rewards/tag_count_reward/mean": 0.7626953125, "rewards/tag_count_reward/std": 0.11957835406064987, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1649.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 1242.0, "completions/min_terminated_length": 0.0, "epoch": 0.1182896645899121, "frac_reward_zero_std": 0.0, "grad_norm": 0.2043738873143181, "kl": 2.04296875, "learning_rate": 1.9980064754481937e-05, "loss": 0.0817, "num_tokens": 219155377.0, "reward": 1.1572265625, "reward_std": 0.5144891142845154, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.390625, "rewards/format_reward/std": 0.48884621262550354, "rewards/tag_count_reward/mean": 0.7509765625, "rewards/tag_count_reward/std": 0.11819319427013397, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1586.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 1108.0, "completions/min_terminated_length": 0.0, "epoch": 0.11846035674660749, "frac_reward_zero_std": 0.0, "grad_norm": 0.3351019915408988, "kl": 2.4375, "learning_rate": 1.9979686971537747e-05, "loss": 0.0975, "num_tokens": 219602161.0, "reward": 1.4443359375, "reward_std": 0.46537336707115173, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.68359375, "rewards/format_reward/std": 0.4659844934940338, "rewards/tag_count_reward/mean": 0.7529296875, "rewards/tag_count_reward/std": 0.09257330000400543, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1423.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1226.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 1058.0, "completions/min_terminated_length": 0.0, "epoch": 0.1186310489033029, "frac_reward_zero_std": 0.125, "grad_norm": 0.34716881597727745, "kl": 2.59375, "learning_rate": 1.9979305646164926e-05, "loss": 0.1037, "num_tokens": 219956145.0, "reward": 1.7021484375, "reward_std": 0.3422727584838867, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 0.8984375, "rewards/format_reward/std": 0.3026638329029083, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.08334290981292725, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1443.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1192.0, "completions/min_terminated_length": 0.0, "epoch": 0.11880174105999829, "frac_reward_zero_std": 0.125, "grad_norm": 0.24367896053553426, "kl": 2.6796875, "learning_rate": 1.9978920778498835e-05, "loss": 0.1072, "num_tokens": 220368785.0, "reward": 1.841796875, "reward_std": 0.2893384099006653, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.740234375, "rewards/tag_count_reward/std": 0.048530805855989456, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1848.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1667.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1499.0, "completions/min_terminated_length": 0.0, "epoch": 0.1189724332166937, "frac_reward_zero_std": 0.4375, "grad_norm": 0.24714735725620757, "kl": 2.64453125, "learning_rate": 1.9978532368676084e-05, "loss": 0.1058, "num_tokens": 220837841.0, "reward": 1.81640625, "reward_std": 0.1538832038640976, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.057389069348573685, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1947.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1693.0, "completions/min_terminated_length": 0.0, "epoch": 0.11914312537338909, "frac_reward_zero_std": 0.25, "grad_norm": 0.23534312376658925, "kl": 2.55859375, "learning_rate": 1.997814041683455e-05, "loss": 0.1025, "num_tokens": 221376945.0, "reward": 1.75, "reward_std": 0.22695331275463104, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.0794435366988182, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11931381753008449, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2090825516189387, "kl": 1.9765625, "learning_rate": 1.9977744923113356e-05, "loss": 0.0791, "num_tokens": 221941649.0, "reward": 1.7041015625, "reward_std": 0.1727205216884613, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.7158203125, "rewards/tag_count_reward/std": 0.09157506376504898, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11948450968677989, "frac_reward_zero_std": 0.0625, "grad_norm": 0.18960884647113213, "kl": 1.150390625, "learning_rate": 1.997734588765289e-05, "loss": 0.046, "num_tokens": 222501969.0, "reward": 1.7666015625, "reward_std": 0.17034369707107544, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22781464457511902, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.7158203125, "rewards/tag_count_reward/std": 0.09157506376504898, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11965520184347529, "frac_reward_zero_std": 0.0625, "grad_norm": 0.23371664175500434, "kl": 0.7001953125, "learning_rate": 1.997694331059479e-05, "loss": 0.028, "num_tokens": 223068545.0, "reward": 1.7275390625, "reward_std": 0.1480671614408493, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.1564512401819229, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.7119140625, "rewards/tag_count_reward/std": 0.09784265607595444, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1198258940001707, "frac_reward_zero_std": 0.0625, "grad_norm": 0.23380006012294313, "kl": 0.56689453125, "learning_rate": 1.9976537192081972e-05, "loss": 0.0227, "num_tokens": 223635393.0, "reward": 1.7783203125, "reward_std": 0.18774652481079102, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.7236328125, "rewards/tag_count_reward/std": 0.08877205103635788, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11999658615686609, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1844417485611113, "kl": 0.6064453125, "learning_rate": 1.9976127532258574e-05, "loss": 0.0242, "num_tokens": 224195169.0, "reward": 1.71875, "reward_std": 0.1317610889673233, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.07415785640478134, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1201672783135615, "frac_reward_zero_std": 0.3125, "grad_norm": 0.17135367794415915, "kl": 0.5908203125, "learning_rate": 1.9975714331270026e-05, "loss": 0.0236, "num_tokens": 224756721.0, "reward": 1.71875, "reward_std": 0.16165989637374878, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.061511795967817307, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12033797047025689, "frac_reward_zero_std": 0.0, "grad_norm": 0.26577579563365805, "kl": 0.4853515625, "learning_rate": 1.997529758926299e-05, "loss": 0.0194, "num_tokens": 225322513.0, "reward": 1.6259765625, "reward_std": 0.3700172007083893, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.85546875, "rewards/format_reward/std": 0.35231640934944153, "rewards/tag_count_reward/mean": 0.7431640625, "rewards/tag_count_reward/std": 0.040850620716810226, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1205086626269523, "frac_reward_zero_std": 0.0, "grad_norm": 0.1947543374807183, "kl": 0.353515625, "learning_rate": 1.9974877306385398e-05, "loss": 0.0141, "num_tokens": 225893777.0, "reward": 1.53515625, "reward_std": 0.4439639151096344, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.41420844197273254, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.06537505239248276, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12067935478364769, "frac_reward_zero_std": 0.0, "grad_norm": 0.19068505414251546, "kl": 0.322265625, "learning_rate": 1.9974453482786437e-05, "loss": 0.0129, "num_tokens": 226459425.0, "reward": 1.529296875, "reward_std": 0.4105759859085083, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.7734375, "rewards/format_reward/std": 0.41942715644836426, "rewards/tag_count_reward/mean": 0.728515625, "rewards/tag_count_reward/std": 0.08871270716190338, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1208500469403431, "frac_reward_zero_std": 0.0, "grad_norm": 0.19549490754964474, "kl": 0.32373046875, "learning_rate": 1.9974026118616542e-05, "loss": 0.0129, "num_tokens": 227029489.0, "reward": 1.6748046875, "reward_std": 0.32183393836021423, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26889389753341675, "rewards/tag_count_reward/mean": 0.7060546875, "rewards/tag_count_reward/std": 0.12236421555280685, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12102073909703849, "frac_reward_zero_std": 0.0, "grad_norm": 0.1684094997421325, "kl": 0.32958984375, "learning_rate": 1.997359521402742e-05, "loss": 0.0132, "num_tokens": 227596337.0, "reward": 1.662109375, "reward_std": 0.27147024869918823, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22781464457511902, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.623046875, "rewards/tag_count_reward/std": 0.20048004388809204, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1211914312537339, "frac_reward_zero_std": 0.0, "grad_norm": 0.21630703612828858, "kl": 0.357421875, "learning_rate": 1.9973160769172023e-05, "loss": 0.0143, "num_tokens": 228158561.0, "reward": 1.6376953125, "reward_std": 0.2764812707901001, "rewards/accuracy_reward/mean": 0.05416666716337204, "rewards/accuracy_reward/std": 0.22681932151317596, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.5986328125, "rewards/tag_count_reward/std": 0.20600801706314087, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12136212341042929, "frac_reward_zero_std": 0.0, "grad_norm": 0.22584211399001541, "kl": 0.35205078125, "learning_rate": 1.9972722784204563e-05, "loss": 0.0141, "num_tokens": 228723985.0, "reward": 1.7080078125, "reward_std": 0.23050859570503235, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.6728515625, "rewards/tag_count_reward/std": 0.15066933631896973, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1215328155671247, "frac_reward_zero_std": 0.0625, "grad_norm": 0.1862686646931076, "kl": 0.36669921875, "learning_rate": 1.9972281259280505e-05, "loss": 0.0147, "num_tokens": 229293361.0, "reward": 1.8212890625, "reward_std": 0.24260252714157104, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.7158203125, "rewards/tag_count_reward/std": 0.1041007786989212, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12170350772382009, "frac_reward_zero_std": 0.0, "grad_norm": 0.19057668668095853, "kl": 0.36767578125, "learning_rate": 1.9971836194556583e-05, "loss": 0.0147, "num_tokens": 229859489.0, "reward": 1.71484375, "reward_std": 0.3078286051750183, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 0.94921875, "rewards/format_reward/std": 0.21998079121112823, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.08984941244125366, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1218741998805155, "frac_reward_zero_std": 0.0625, "grad_norm": 0.19582607461472124, "kl": 0.341796875, "learning_rate": 1.9971387590190776e-05, "loss": 0.0137, "num_tokens": 230423633.0, "reward": 1.7236328125, "reward_std": 0.2597852349281311, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.22781464457511902, "rewards/tag_count_reward/mean": 0.7158203125, "rewards/tag_count_reward/std": 0.10171910375356674, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12204489203721089, "frac_reward_zero_std": 0.0625, "grad_norm": 0.17368011409710304, "kl": 0.32861328125, "learning_rate": 1.9970935446342315e-05, "loss": 0.0131, "num_tokens": 230999969.0, "reward": 1.7197265625, "reward_std": 0.35337647795677185, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 0.91796875, "rewards/format_reward/std": 0.2749498784542084, "rewards/tag_count_reward/mean": 0.7158203125, "rewards/tag_count_reward/std": 0.09678007662296295, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1222155841939063, "frac_reward_zero_std": 0.0, "grad_norm": 0.20879346442950303, "kl": 0.33837890625, "learning_rate": 1.9970479763171705e-05, "loss": 0.0135, "num_tokens": 231568593.0, "reward": 1.63671875, "reward_std": 0.3823738098144531, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.87109375, "rewards/format_reward/std": 0.33575257658958435, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.09393364936113358, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12238627635060169, "frac_reward_zero_std": 0.0, "grad_norm": 0.19484874683630293, "kl": 0.35791015625, "learning_rate": 1.9970020540840696e-05, "loss": 0.0143, "num_tokens": 232133841.0, "reward": 1.7353515625, "reward_std": 0.4021226763725281, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.36746934056282043, "rewards/format_reward/mean": 0.84765625, "rewards/format_reward/std": 0.3600577116012573, "rewards/tag_count_reward/mean": 0.7275390625, "rewards/tag_count_reward/std": 0.07817400991916656, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1225569685072971, "frac_reward_zero_std": 0.0625, "grad_norm": 0.1859934747250872, "kl": 0.3232421875, "learning_rate": 1.9969557779512287e-05, "loss": 0.0129, "num_tokens": 232704673.0, "reward": 1.6298828125, "reward_std": 0.3828265368938446, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3483152687549591, "rewards/tag_count_reward/mean": 0.7236328125, "rewards/tag_count_reward/std": 0.08877205103635788, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12272766066399249, "frac_reward_zero_std": 0.1875, "grad_norm": 0.18444326672761405, "kl": 0.337890625, "learning_rate": 1.9969091479350745e-05, "loss": 0.0135, "num_tokens": 233274145.0, "reward": 1.74609375, "reward_std": 0.2544921338558197, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24253563582897186, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.06454972177743912, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1228983528206879, "frac_reward_zero_std": 0.3125, "grad_norm": 0.16446060638496732, "kl": 0.36083984375, "learning_rate": 1.9968621640521596e-05, "loss": 0.0144, "num_tokens": 233841745.0, "reward": 1.7060546875, "reward_std": 0.19328509271144867, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.7255859375, "rewards/tag_count_reward/std": 0.0775839164853096, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12306904497738329, "frac_reward_zero_std": 0.25, "grad_norm": 0.23954613720683074, "kl": 0.38720703125, "learning_rate": 1.996814826319161e-05, "loss": 0.0155, "num_tokens": 234407217.0, "reward": 1.7548828125, "reward_std": 0.16206541657447815, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.7275390625, "rewards/tag_count_reward/std": 0.07817400991916656, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1232397371340787, "frac_reward_zero_std": 0.0, "grad_norm": 0.20271510734237047, "kl": 0.3623046875, "learning_rate": 1.9967671347528822e-05, "loss": 0.0145, "num_tokens": 234977777.0, "reward": 1.892578125, "reward_std": 0.327544242143631, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41420844197273254, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.712890625, "rewards/tag_count_reward/std": 0.10887051373720169, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12341042929077409, "frac_reward_zero_std": 0.1875, "grad_norm": 0.1715518509337217, "kl": 0.3505859375, "learning_rate": 1.996719089370251e-05, "loss": 0.014, "num_tokens": 235544657.0, "reward": 1.7001953125, "reward_std": 0.15626785159111023, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.7197265625, "rewards/tag_count_reward/std": 0.08466499298810959, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1235811214474695, "frac_reward_zero_std": 0.0625, "grad_norm": 0.20231219854757723, "kl": 0.3505859375, "learning_rate": 1.9966706901883236e-05, "loss": 0.014, "num_tokens": 236114337.0, "reward": 1.6669921875, "reward_std": 0.2731642425060272, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.92578125, "rewards/format_reward/std": 0.2626400291919708, "rewards/tag_count_reward/mean": 0.7216796875, "rewards/tag_count_reward/std": 0.0881660208106041, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12375181360416489, "frac_reward_zero_std": 0.0625, "grad_norm": 0.18331734339877198, "kl": 0.31787109375, "learning_rate": 1.996621937224278e-05, "loss": 0.0127, "num_tokens": 236679057.0, "reward": 1.71875, "reward_std": 0.32663747668266296, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 0.90234375, "rewards/format_reward/std": 0.29743078351020813, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.08707881718873978, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1239225057608603, "frac_reward_zero_std": 0.0, "grad_norm": 0.18121661608888023, "kl": 0.32763671875, "learning_rate": 1.9965728304954213e-05, "loss": 0.0131, "num_tokens": 237253521.0, "reward": 1.5703125, "reward_std": 0.3591051697731018, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33136674761772156, "rewards/tag_count_reward/mean": 0.67578125, "rewards/tag_count_reward/std": 0.1323223114013672, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12409319791755569, "frac_reward_zero_std": 0.0, "grad_norm": 0.1667298299327286, "kl": 0.31103515625, "learning_rate": 1.9965233700191837e-05, "loss": 0.0124, "num_tokens": 237815905.0, "reward": 1.662109375, "reward_std": 0.25054144859313965, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.22781464457511902, "rewards/tag_count_reward/mean": 0.705078125, "rewards/tag_count_reward/std": 0.10353361815214157, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1242638900742511, "frac_reward_zero_std": 0.0, "grad_norm": 0.17896682420718465, "kl": 0.31591796875, "learning_rate": 1.9964735558131223e-05, "loss": 0.0126, "num_tokens": 238375217.0, "reward": 1.6689453125, "reward_std": 0.2619393765926361, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.94140625, "rewards/format_reward/std": 0.23532284796237946, "rewards/tag_count_reward/mean": 0.7001953125, "rewards/tag_count_reward/std": 0.11380220949649811, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12443458223094649, "frac_reward_zero_std": 0.0625, "grad_norm": 0.16942602184048025, "kl": 0.3134765625, "learning_rate": 1.9964233878949194e-05, "loss": 0.0125, "num_tokens": 238940193.0, "reward": 1.7021484375, "reward_std": 0.3538602292537689, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.92578125, "rewards/format_reward/std": 0.2626400291919708, "rewards/tag_count_reward/mean": 0.7138671875, "rewards/tag_count_reward/std": 0.10103914141654968, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12460527438764189, "frac_reward_zero_std": 0.0625, "grad_norm": 10.875925661786635, "kl": 1.421875, "learning_rate": 1.9963728662823823e-05, "loss": 0.0569, "num_tokens": 239505217.0, "reward": 1.7373046875, "reward_std": 0.29755058884620667, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21178513765335083, "rewards/tag_count_reward/mean": 0.7255859375, "rewards/tag_count_reward/std": 0.0775839164853096, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12477596654433729, "frac_reward_zero_std": 0.125, "grad_norm": 0.16779869587429003, "kl": 0.3857421875, "learning_rate": 1.9963219909934448e-05, "loss": 0.0154, "num_tokens": 240071201.0, "reward": 1.7109375, "reward_std": 0.3383483290672302, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 0.94140625, "rewards/format_reward/std": 0.23532284796237946, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.11070186644792557, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12494665870103269, "frac_reward_zero_std": 0.0, "grad_norm": 0.28234414565646354, "kl": 0.8720703125, "learning_rate": 1.9962707620461653e-05, "loss": 0.0349, "num_tokens": 240636225.0, "reward": 1.6005859375, "reward_std": 0.36315566301345825, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.88671875, "rewards/format_reward/std": 0.31755712628364563, "rewards/tag_count_reward/mean": 0.7060546875, "rewards/tag_count_reward/std": 0.12435110658407211, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1251173508577281, "frac_reward_zero_std": 0.0, "grad_norm": 0.2225392923918815, "kl": 1.0458984375, "learning_rate": 1.9962191794587292e-05, "loss": 0.0419, "num_tokens": 241196177.0, "reward": 1.6259765625, "reward_std": 0.46437740325927734, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.87109375, "rewards/format_reward/std": 0.33575257658958435, "rewards/tag_count_reward/mean": 0.6923828125, "rewards/tag_count_reward/std": 0.13594472408294678, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12528804301442348, "frac_reward_zero_std": 0.0, "grad_norm": 0.19367257914729288, "kl": 1.0908203125, "learning_rate": 1.9961672432494456e-05, "loss": 0.0437, "num_tokens": 241767793.0, "reward": 1.5830078125, "reward_std": 0.40292543172836304, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.87890625, "rewards/format_reward/std": 0.3268752694129944, "rewards/tag_count_reward/mean": 0.6728515625, "rewards/tag_count_reward/std": 0.15388840436935425, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12545873517111888, "frac_reward_zero_std": 0.0, "grad_norm": 0.3058085045067042, "kl": 1.3359375, "learning_rate": 1.99611495343675e-05, "loss": 0.0534, "num_tokens": 242330161.0, "reward": 1.63671875, "reward_std": 0.3125320076942444, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.94921875, "rewards/format_reward/std": 0.21998079121112823, "rewards/tag_count_reward/mean": 0.65625, "rewards/tag_count_reward/std": 0.16118435561656952, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1256294273278143, "frac_reward_zero_std": 0.0, "grad_norm": 0.24990630221291552, "kl": 0.6357421875, "learning_rate": 1.996062310039204e-05, "loss": 0.0254, "num_tokens": 242894321.0, "reward": 1.6953125, "reward_std": 0.2983081042766571, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.69140625, "rewards/tag_count_reward/std": 0.12321022897958755, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1258001194845097, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2440636299575953, "kl": 0.45068359375, "learning_rate": 1.996009313075493e-05, "loss": 0.018, "num_tokens": 243455617.0, "reward": 1.69921875, "reward_std": 0.24269999563694, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.703125, "rewards/tag_count_reward/std": 0.1181972473859787, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12597081164120508, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5808460960477073, "kl": 0.4716796875, "learning_rate": 1.9959559625644304e-05, "loss": 0.0189, "num_tokens": 244021073.0, "reward": 1.734375, "reward_std": 0.23405210673809052, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22781464457511902, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.70703125, "rewards/tag_count_reward/std": 0.11118512600660324, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12614150379790048, "frac_reward_zero_std": 0.0, "grad_norm": 0.21097363551731582, "kl": 0.38720703125, "learning_rate": 1.995902258524953e-05, "loss": 0.0155, "num_tokens": 244586273.0, "reward": 1.7529296875, "reward_std": 0.30732589960098267, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2749498784542084, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.7060546875, "rewards/tag_count_reward/std": 0.10968967527151108, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1263121959545959, "frac_reward_zero_std": 0.125, "grad_norm": 0.1973770169812831, "kl": 0.31787109375, "learning_rate": 1.9958482009761234e-05, "loss": 0.0127, "num_tokens": 245151233.0, "reward": 1.7900390625, "reward_std": 0.25077903270721436, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31272050738334656, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.7158203125, "rewards/tag_count_reward/std": 0.09928031265735626, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1264828881112913, "frac_reward_zero_std": 0.1875, "grad_norm": 0.16156638133266246, "kl": 0.36083984375, "learning_rate": 1.9957937899371308e-05, "loss": 0.0145, "num_tokens": 245716865.0, "reward": 1.7470703125, "reward_std": 0.1718634068965912, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.7236328125, "rewards/tag_count_reward/std": 0.08306674659252167, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12665358026798668, "frac_reward_zero_std": 0.0, "grad_norm": 0.18821937850934412, "kl": 0.390625, "learning_rate": 1.9957390254272893e-05, "loss": 0.0156, "num_tokens": 246282097.0, "reward": 1.6884765625, "reward_std": 0.2149449586868286, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.7080078125, "rewards/tag_count_reward/std": 0.11045510321855545, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12682427242468208, "frac_reward_zero_std": 0.0625, "grad_norm": 0.32247433834048017, "kl": 0.48876953125, "learning_rate": 1.9956839074660372e-05, "loss": 0.0195, "num_tokens": 246861873.0, "reward": 1.7490234375, "reward_std": 0.34313827753067017, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.20318391919136047, "rewards/tag_count_reward/mean": 0.7021484375, "rewards/tag_count_reward/std": 0.1268211454153061, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1269949645813775, "frac_reward_zero_std": 0.1875, "grad_norm": 0.22608216393302213, "kl": 0.5, "learning_rate": 1.9956284360729404e-05, "loss": 0.02, "num_tokens": 247427329.0, "reward": 1.73046875, "reward_std": 0.16016709804534912, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.1001683697104454, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1271656567380729, "frac_reward_zero_std": 0.0, "grad_norm": 0.15621226484025214, "kl": 0.335693359375, "learning_rate": 1.9955726112676887e-05, "loss": 0.0134, "num_tokens": 247991137.0, "reward": 1.7578125, "reward_std": 0.3180091977119446, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.6953125, "rewards/tag_count_reward/std": 0.14327234029769897, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12733634889476828, "frac_reward_zero_std": 0.0, "grad_norm": 0.19895604081206042, "kl": 0.36865234375, "learning_rate": 1.9955164330700985e-05, "loss": 0.0148, "num_tokens": 248556161.0, "reward": 1.7607421875, "reward_std": 0.2618296444416046, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.6826171875, "rewards/tag_count_reward/std": 0.15996502339839935, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12750704105146368, "frac_reward_zero_std": 0.0, "grad_norm": 0.16937390327997692, "kl": 0.318359375, "learning_rate": 1.99545990150011e-05, "loss": 0.0127, "num_tokens": 249117873.0, "reward": 1.7021484375, "reward_std": 0.21059267222881317, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.7138671875, "rewards/tag_count_reward/std": 0.11251617223024368, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1276777332081591, "frac_reward_zero_std": 0.125, "grad_norm": 0.6532627144043602, "kl": 0.45361328125, "learning_rate": 1.995403016577791e-05, "loss": 0.0182, "num_tokens": 249684561.0, "reward": 1.724609375, "reward_std": 0.25266411900520325, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.724609375, "rewards/tag_count_reward/std": 0.09569070488214493, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1278484253648545, "frac_reward_zero_std": 0.0, "grad_norm": 0.3498360201970761, "kl": 0.4248046875, "learning_rate": 1.9953457783233328e-05, "loss": 0.017, "num_tokens": 250255217.0, "reward": 1.6982421875, "reward_std": 0.27671539783477783, "rewards/accuracy_reward/mean": 0.03333333507180214, "rewards/accuracy_reward/std": 0.17988063395023346, "rewards/format_reward/mean": 0.94921875, "rewards/format_reward/std": 0.21998079121112823, "rewards/tag_count_reward/mean": 0.7177734375, "rewards/tag_count_reward/std": 0.10472430288791656, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12801911752154987, "frac_reward_zero_std": 0.1875, "grad_norm": 0.17434824329695175, "kl": 0.31640625, "learning_rate": 1.9952881867570536e-05, "loss": 0.0127, "num_tokens": 250821009.0, "reward": 1.7509765625, "reward_std": 0.18257087469100952, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.7275390625, "rewards/tag_count_reward/std": 0.08707331866025925, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12818980967824528, "frac_reward_zero_std": 0.0625, "grad_norm": 0.1483895842885732, "kl": 0.2724609375, "learning_rate": 1.995230241899396e-05, "loss": 0.0109, "num_tokens": 251389521.0, "reward": 1.732421875, "reward_std": 0.23199781775474548, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.708984375, "rewards/tag_count_reward/std": 0.1183105930685997, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1283605018349407, "frac_reward_zero_std": 0.125, "grad_norm": 0.15915402243968105, "kl": 0.3447265625, "learning_rate": 1.995171943770928e-05, "loss": 0.0138, "num_tokens": 251955745.0, "reward": 1.75, "reward_std": 0.20674991607666016, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.7109375, "rewards/tag_count_reward/std": 0.11897231638431549, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1285311939916361, "frac_reward_zero_std": 0.125, "grad_norm": 0.24023177692464678, "kl": 0.34375, "learning_rate": 1.9951132923923434e-05, "loss": 0.0138, "num_tokens": 252517825.0, "reward": 1.728515625, "reward_std": 0.23095373809337616, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.708984375, "rewards/tag_count_reward/std": 0.1183105930685997, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12870188614833147, "frac_reward_zero_std": 0.0625, "grad_norm": 0.1934500896611492, "kl": 0.31005859375, "learning_rate": 1.9950542877844617e-05, "loss": 0.0124, "num_tokens": 253077601.0, "reward": 1.69140625, "reward_std": 0.18009531497955322, "rewards/accuracy_reward/mean": 0.004166666883975267, "rewards/accuracy_reward/std": 0.06454972177743912, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.0990147590637207, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12887257830502688, "frac_reward_zero_std": 0.0625, "grad_norm": 0.35051240385718646, "kl": 0.30908203125, "learning_rate": 1.9949949299682277e-05, "loss": 0.0124, "num_tokens": 253641265.0, "reward": 1.7802734375, "reward_std": 0.28660112619400024, "rewards/accuracy_reward/mean": 0.09765625, "rewards/accuracy_reward/std": 0.29743078351020813, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.7177734375, "rewards/tag_count_reward/std": 0.10703913122415543, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1290432704617223, "frac_reward_zero_std": 0.25, "grad_norm": 0.29062567919137117, "kl": 0.400390625, "learning_rate": 1.9949352189647104e-05, "loss": 0.016, "num_tokens": 254201281.0, "reward": 1.697265625, "reward_std": 0.22434262931346893, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.94921875, "rewards/format_reward/std": 0.21998079121112823, "rewards/tag_count_reward/mean": 0.732421875, "rewards/tag_count_reward/std": 0.0867924839258194, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1292139626184177, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2049255224766715, "kl": 0.33984375, "learning_rate": 1.9948751547951052e-05, "loss": 0.0136, "num_tokens": 254768433.0, "reward": 1.677734375, "reward_std": 0.2756742835044861, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.9296875, "rewards/format_reward/std": 0.2561737895011902, "rewards/tag_count_reward/mean": 0.744140625, "rewards/tag_count_reward/std": 0.0910981148481369, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12938465477511307, "frac_reward_zero_std": 0.125, "grad_norm": 0.3654921583921139, "kl": 0.35302734375, "learning_rate": 1.9948147374807334e-05, "loss": 0.0141, "num_tokens": 255330385.0, "reward": 1.7333984375, "reward_std": 0.2574020028114319, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.93359375, "rewards/format_reward/std": 0.24947863817214966, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.08034826815128326, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12955534693180848, "frac_reward_zero_std": 0.3125, "grad_norm": 0.16703654125757672, "kl": 0.32861328125, "learning_rate": 1.99475396704304e-05, "loss": 0.0131, "num_tokens": 255893665.0, "reward": 1.7451171875, "reward_std": 0.21475835144519806, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.07723760604858398, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.12972603908850389, "frac_reward_zero_std": 0.25, "grad_norm": 0.2396102259840015, "kl": 0.38916015625, "learning_rate": 1.9946928435035976e-05, "loss": 0.0156, "num_tokens": 256460177.0, "reward": 1.73046875, "reward_std": 0.2363988608121872, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.08049707859754562, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1298967312451993, "frac_reward_zero_std": 0.375, "grad_norm": 0.23799478581318073, "kl": 0.3671875, "learning_rate": 1.9946313668841018e-05, "loss": 0.0147, "num_tokens": 257028193.0, "reward": 1.7919921875, "reward_std": 0.21133366227149963, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.7412109375, "rewards/tag_count_reward/std": 0.05117155611515045, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13006742340189467, "frac_reward_zero_std": 0.625, "grad_norm": 0.5568785171407197, "kl": 0.37744140625, "learning_rate": 1.9945695372063746e-05, "loss": 0.0151, "num_tokens": 257592129.0, "reward": 1.78125, "reward_std": 0.08900680392980576, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.06537505239248276, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13023811555859008, "frac_reward_zero_std": 0.5, "grad_norm": 0.1429147146723724, "kl": 0.34814453125, "learning_rate": 1.994507354492364e-05, "loss": 0.0139, "num_tokens": 258157041.0, "reward": 1.875, "reward_std": 0.125, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13040880771528549, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5615576925903194, "kl": 0.4140625, "learning_rate": 1.994444818764142e-05, "loss": 0.0166, "num_tokens": 258720097.0, "reward": 1.76171875, "reward_std": 0.12026958167552948, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.05366471782326698, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1305794998719809, "frac_reward_zero_std": 0.625, "grad_norm": 5.1506996564785315, "kl": 1.0048828125, "learning_rate": 1.9943819300439065e-05, "loss": 0.0402, "num_tokens": 259288033.0, "reward": 1.7587890625, "reward_std": 0.09545498341321945, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.7392578125, "rewards/tag_count_reward/std": 0.06364604830741882, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13075019202867627, "frac_reward_zero_std": 0.25, "grad_norm": 0.22483877354944606, "kl": 0.33203125, "learning_rate": 1.9943186883539817e-05, "loss": 0.0133, "num_tokens": 259857201.0, "reward": 1.845703125, "reward_std": 0.21758408844470978, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.736328125, "rewards/tag_count_reward/std": 0.06110577657818794, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13092088418537168, "frac_reward_zero_std": 0.125, "grad_norm": 1.1208487253103223, "kl": 0.3408203125, "learning_rate": 1.9942550937168147e-05, "loss": 0.0136, "num_tokens": 260418737.0, "reward": 1.8544921875, "reward_std": 0.19470898807048798, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.7333984375, "rewards/tag_count_reward/std": 0.07649025321006775, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13109157634206708, "frac_reward_zero_std": 0.375, "grad_norm": 5.556627836120876, "kl": 1.205078125, "learning_rate": 1.99419114615498e-05, "loss": 0.0482, "num_tokens": 260983377.0, "reward": 1.736328125, "reward_std": 0.15599173307418823, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.740234375, "rewards/tag_count_reward/std": 0.05334262177348137, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1312622684987625, "frac_reward_zero_std": 0.25, "grad_norm": 3.7595529006967165, "kl": 0.92529296875, "learning_rate": 1.994126845691177e-05, "loss": 0.037, "num_tokens": 261550545.0, "reward": 1.7392578125, "reward_std": 0.2180558741092682, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.7353515625, "rewards/tag_count_reward/std": 0.07688974589109421, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13143296065545787, "frac_reward_zero_std": 0.25, "grad_norm": 0.6897902495865468, "kl": 0.3408203125, "learning_rate": 1.9940621923482296e-05, "loss": 0.0136, "num_tokens": 262132369.0, "reward": 1.76953125, "reward_std": 0.24467390775680542, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.26394182443618774, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.08134892582893372, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13160365281215328, "frac_reward_zero_std": 0.1875, "grad_norm": 1.7148581659697233, "kl": 0.39453125, "learning_rate": 1.9939971861490874e-05, "loss": 0.0158, "num_tokens": 262693777.0, "reward": 1.6669921875, "reward_std": 0.24919307231903076, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26889389753341675, "rewards/tag_count_reward/mean": 0.7333984375, "rewards/tag_count_reward/std": 0.07963011413812637, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13177434496884868, "frac_reward_zero_std": 0.0, "grad_norm": 9.128274336913726, "kl": 1.93896484375, "learning_rate": 1.9939318271168253e-05, "loss": 0.0775, "num_tokens": 263257265.0, "reward": 1.6943359375, "reward_std": 0.31265658140182495, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24253563582897186, "rewards/tag_count_reward/mean": 0.7255859375, "rewards/tag_count_reward/std": 0.08933111280202866, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1319450371255441, "frac_reward_zero_std": 0.125, "grad_norm": 2.8215479557089056, "kl": 0.60791015625, "learning_rate": 1.9938661152746437e-05, "loss": 0.0243, "num_tokens": 263826369.0, "reward": 1.7138671875, "reward_std": 0.2374623417854309, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.22781464457511902, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.07060634344816208, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13211572928223947, "frac_reward_zero_std": 0.0625, "grad_norm": 1.1565509383616464, "kl": 0.98193359375, "learning_rate": 1.9938000506458676e-05, "loss": 0.0393, "num_tokens": 264393393.0, "reward": 1.685546875, "reward_std": 0.28005215525627136, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.92578125, "rewards/format_reward/std": 0.2626400291919708, "rewards/tag_count_reward/mean": 0.736328125, "rewards/tag_count_reward/std": 0.06866081804037094, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13228642143893488, "frac_reward_zero_std": 0.0625, "grad_norm": 1.281722477885657, "kl": 0.52197265625, "learning_rate": 1.9937336332539475e-05, "loss": 0.0209, "num_tokens": 264960833.0, "reward": 1.6884765625, "reward_std": 0.27743130922317505, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.92578125, "rewards/format_reward/std": 0.2626400291919708, "rewards/tag_count_reward/mean": 0.7314453125, "rewards/tag_count_reward/std": 0.06929237395524979, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13245711359563028, "frac_reward_zero_std": 0.0625, "grad_norm": 7.655275557530687, "kl": 1.69482421875, "learning_rate": 1.9936668631224593e-05, "loss": 0.0677, "num_tokens": 265536369.0, "reward": 1.6884765625, "reward_std": 0.2654553949832916, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26889389753341675, "rewards/tag_count_reward/mean": 0.7236328125, "rewards/tag_count_reward/std": 0.09149138629436493, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1326278057523257, "frac_reward_zero_std": 0.25, "grad_norm": 2.35927902367611, "kl": 0.57763671875, "learning_rate": 1.9935997402751043e-05, "loss": 0.0231, "num_tokens": 266105841.0, "reward": 1.736328125, "reward_std": 0.17628207802772522, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.732421875, "rewards/tag_count_reward/std": 0.07464683800935745, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13279849790902107, "frac_reward_zero_std": 0.3125, "grad_norm": 1.4824049849317251, "kl": 0.3173828125, "learning_rate": 1.9935322647357082e-05, "loss": 0.0127, "num_tokens": 266670673.0, "reward": 1.8115234375, "reward_std": 0.1820221245288849, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.0632839947938919, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13296919006571647, "frac_reward_zero_std": 0.3125, "grad_norm": 6.494037045980072, "kl": 1.40380859375, "learning_rate": 1.9934644365282224e-05, "loss": 0.056, "num_tokens": 267238881.0, "reward": 1.740234375, "reward_std": 0.13705813884735107, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.736328125, "rewards/tag_count_reward/std": 0.06499316543340683, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13313988222241188, "frac_reward_zero_std": 0.375, "grad_norm": 0.15805412278841863, "kl": 0.25927734375, "learning_rate": 1.993396255676724e-05, "loss": 0.0104, "num_tokens": 267802049.0, "reward": 1.7900390625, "reward_std": 0.1710512638092041, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.7392578125, "rewards/tag_count_reward/std": 0.0554114393889904, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1333105743791073, "frac_reward_zero_std": 0.3125, "grad_norm": 3.8476821438393722, "kl": 1.04150390625, "learning_rate": 1.993327722205414e-05, "loss": 0.0416, "num_tokens": 268372977.0, "reward": 1.8037109375, "reward_std": 0.16247692704200745, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2561737895011902, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.05928463488817215, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13348126653580267, "frac_reward_zero_std": 0.25, "grad_norm": 1.1793948941676942, "kl": 1.03271484375, "learning_rate": 1.9932588361386197e-05, "loss": 0.0414, "num_tokens": 268935153.0, "reward": 1.7529296875, "reward_std": 0.1738618016242981, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.7333984375, "rewards/tag_count_reward/std": 0.07321586459875107, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13365195869249807, "frac_reward_zero_std": 0.1875, "grad_norm": 0.189028675161215, "kl": 0.452880859375, "learning_rate": 1.9931895975007934e-05, "loss": 0.018, "num_tokens": 269498545.0, "reward": 1.728515625, "reward_std": 0.11954152584075928, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.732421875, "rewards/tag_count_reward/std": 0.06776251643896103, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13382265084919348, "frac_reward_zero_std": 0.375, "grad_norm": 8.874695607348235, "kl": 0.32421875, "learning_rate": 1.993120006316512e-05, "loss": 0.013, "num_tokens": 270062737.0, "reward": 1.7724609375, "reward_std": 0.20298613607883453, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.07060634344816208, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1339933430058889, "frac_reward_zero_std": 0.375, "grad_norm": 0.9556630402977965, "kl": 0.256591796875, "learning_rate": 1.993050062610478e-05, "loss": 0.0103, "num_tokens": 270631345.0, "reward": 1.775390625, "reward_std": 0.15972957015037537, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.740234375, "rewards/tag_count_reward/std": 0.05334262177348137, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13416403516258427, "frac_reward_zero_std": 0.1875, "grad_norm": 0.9116991586612729, "kl": 0.28173828125, "learning_rate": 1.992979766407519e-05, "loss": 0.0113, "num_tokens": 271192369.0, "reward": 1.787109375, "reward_std": 0.2272486835718155, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.736328125, "rewards/tag_count_reward/std": 0.07214225083589554, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13433472731927967, "frac_reward_zero_std": 0.25, "grad_norm": 3.4466078043127135, "kl": 1.559326171875, "learning_rate": 1.992909117732587e-05, "loss": 0.0623, "num_tokens": 271758017.0, "reward": 1.693359375, "reward_std": 0.24107451736927032, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.94140625, "rewards/format_reward/std": 0.23532284796237946, "rewards/tag_count_reward/mean": 0.732421875, "rewards/tag_count_reward/std": 0.0867924839258194, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13450541947597508, "frac_reward_zero_std": 0.25, "grad_norm": 0.13651985241893952, "kl": 0.621337890625, "learning_rate": 1.9928381166107605e-05, "loss": 0.0249, "num_tokens": 272316305.0, "reward": 1.693359375, "reward_std": 0.17230066657066345, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.20318391919136047, "rewards/tag_count_reward/mean": 0.736328125, "rewards/tag_count_reward/std": 0.09025342762470245, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1346761116326705, "frac_reward_zero_std": 0.0, "grad_norm": 0.7246774108034179, "kl": 0.341064453125, "learning_rate": 1.9927667630672417e-05, "loss": 0.0137, "num_tokens": 272880993.0, "reward": 1.705078125, "reward_std": 0.3350629210472107, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.91796875, "rewards/format_reward/std": 0.2749498784542084, "rewards/tag_count_reward/mean": 0.744140625, "rewards/tag_count_reward/std": 0.0661611557006836, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13484680378936587, "frac_reward_zero_std": 0.1875, "grad_norm": 2.8637933760955026, "kl": 0.881591796875, "learning_rate": 1.992695057127359e-05, "loss": 0.0353, "num_tokens": 273440081.0, "reward": 1.8056640625, "reward_std": 0.2185794562101364, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.7470703125, "rewards/tag_count_reward/std": 0.05637072026729584, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13501749594606127, "frac_reward_zero_std": 0.25, "grad_norm": 0.14223402517425826, "kl": 0.207763671875, "learning_rate": 1.9926229988165657e-05, "loss": 0.0083, "num_tokens": 274006481.0, "reward": 1.7744140625, "reward_std": 0.24560454487800598, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.7470703125, "rewards/tag_count_reward/std": 0.05184073746204376, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13518818810275668, "frac_reward_zero_std": 0.375, "grad_norm": 3.9483989549550538, "kl": 0.45654296875, "learning_rate": 1.9925505881604388e-05, "loss": 0.0183, "num_tokens": 274574673.0, "reward": 1.8505859375, "reward_std": 0.15443843603134155, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.7451171875, "rewards/tag_count_reward/std": 0.05169277638196945, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13535888025945209, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10552175982695895, "kl": 0.202392578125, "learning_rate": 1.992477825184682e-05, "loss": 0.0081, "num_tokens": 275139217.0, "reward": 1.798828125, "reward_std": 0.15556925535202026, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.748046875, "rewards/tag_count_reward/std": 0.03125, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13552957241614746, "frac_reward_zero_std": 0.5, "grad_norm": 0.21536002931522508, "kl": 0.39208984375, "learning_rate": 1.9924047099151244e-05, "loss": 0.0157, "num_tokens": 275699969.0, "reward": 1.8984375, "reward_std": 0.14204756915569305, "rewards/accuracy_reward/mean": 0.16015625, "rewards/accuracy_reward/std": 0.36746934056282043, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.048884619027376175, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13570026457284287, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10434607801463426, "kl": 0.20263671875, "learning_rate": 1.992331242377718e-05, "loss": 0.0081, "num_tokens": 276266225.0, "reward": 1.82421875, "reward_std": 0.12292127311229706, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.048884619027376175, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13587095672953828, "frac_reward_zero_std": 0.375, "grad_norm": 2.8004707035203285, "kl": 0.430419921875, "learning_rate": 1.9922574225985416e-05, "loss": 0.0172, "num_tokens": 276827953.0, "reward": 1.8623046875, "reward_std": 0.20560979843139648, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.34422317147254944, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.7412109375, "rewards/tag_count_reward/std": 0.05575593560934067, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13604164888623368, "frac_reward_zero_std": 0.5, "grad_norm": 0.7661641690456122, "kl": 0.808837890625, "learning_rate": 1.9921832506037988e-05, "loss": 0.0324, "num_tokens": 277397777.0, "reward": 1.8251953125, "reward_std": 0.13941097259521484, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.7431640625, "rewards/tag_count_reward/std": 0.051470041275024414, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13621234104292906, "frac_reward_zero_std": 0.625, "grad_norm": 0.10142024344828111, "kl": 0.22705078125, "learning_rate": 1.9921087264198178e-05, "loss": 0.0091, "num_tokens": 277964177.0, "reward": 1.78125, "reward_std": 0.07433247566223145, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13638303319962447, "frac_reward_zero_std": 0.375, "grad_norm": 0.9783838751649524, "kl": 0.31591796875, "learning_rate": 1.9920338500730517e-05, "loss": 0.0126, "num_tokens": 278531985.0, "reward": 1.796875, "reward_std": 0.156251460313797, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.04358336701989174, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13655372535631988, "frac_reward_zero_std": 0.5625, "grad_norm": 1.4381037418716547, "kl": 0.451171875, "learning_rate": 1.9919586215900797e-05, "loss": 0.018, "num_tokens": 279102625.0, "reward": 1.796875, "reward_std": 0.11240890622138977, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.044107433408498764, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13672441751301528, "frac_reward_zero_std": 0.4375, "grad_norm": 0.17564534850785823, "kl": 0.50927734375, "learning_rate": 1.9918830409976044e-05, "loss": 0.0204, "num_tokens": 279678801.0, "reward": 1.7646484375, "reward_std": 0.1585611253976822, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.06704521179199219, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1368951096697107, "frac_reward_zero_std": 0.5, "grad_norm": 0.12896243324076795, "kl": 0.25634765625, "learning_rate": 1.9918071083224544e-05, "loss": 0.0102, "num_tokens": 280242241.0, "reward": 1.7939453125, "reward_std": 0.1576441079378128, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.7431640625, "rewards/tag_count_reward/std": 0.040850620716810226, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13706580182640607, "frac_reward_zero_std": 0.375, "grad_norm": 0.19305733308665288, "kl": 0.29052734375, "learning_rate": 1.9917308235915832e-05, "loss": 0.0116, "num_tokens": 280799697.0, "reward": 1.751953125, "reward_std": 0.12128816545009613, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.736328125, "rewards/tag_count_reward/std": 0.056953661143779755, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13723649398310148, "frac_reward_zero_std": 0.1875, "grad_norm": 0.5236390428952392, "kl": 0.2900390625, "learning_rate": 1.991654186832069e-05, "loss": 0.0116, "num_tokens": 281361233.0, "reward": 1.7490234375, "reward_std": 0.18787196278572083, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.7333984375, "rewards/tag_count_reward/std": 0.06618285179138184, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13740718613979688, "frac_reward_zero_std": 0.1875, "grad_norm": 0.1759233160279489, "kl": 0.294921875, "learning_rate": 1.9915771980711156e-05, "loss": 0.0118, "num_tokens": 281932881.0, "reward": 1.802734375, "reward_std": 0.24778631329536438, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.744140625, "rewards/tag_count_reward/std": 0.03789619356393814, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1375778782964923, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14886617362327487, "kl": 0.28564453125, "learning_rate": 1.99149985733605e-05, "loss": 0.0114, "num_tokens": 282500817.0, "reward": 1.8134765625, "reward_std": 0.16315355896949768, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.7431640625, "rewards/tag_count_reward/std": 0.051470041275024414, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13774857045318767, "frac_reward_zero_std": 0.5, "grad_norm": 0.13754527146092724, "kl": 0.27099609375, "learning_rate": 1.9914221646543268e-05, "loss": 0.0108, "num_tokens": 283066481.0, "reward": 1.7998046875, "reward_std": 0.15229685604572296, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2749498784542084, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.7490234375, "rewards/tag_count_reward/std": 0.015625, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13791926260988308, "frac_reward_zero_std": 0.375, "grad_norm": 0.15712873859250462, "kl": 0.28271484375, "learning_rate": 1.991344120053523e-05, "loss": 0.0113, "num_tokens": 283631329.0, "reward": 1.869140625, "reward_std": 0.2214145064353943, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3483152687549591, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.744140625, "rewards/tag_count_reward/std": 0.03789619356393814, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13808995476657848, "frac_reward_zero_std": 0.25, "grad_norm": 88.22305283566813, "kl": 13.0830078125, "learning_rate": 1.9912657235613425e-05, "loss": 0.5237, "num_tokens": 284196177.0, "reward": 1.806640625, "reward_std": 0.17330113053321838, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.744140625, "rewards/tag_count_reward/std": 0.03789619356393814, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1382606469232739, "frac_reward_zero_std": 0.5, "grad_norm": 0.1324062963091881, "kl": 0.2802734375, "learning_rate": 1.9911869752056122e-05, "loss": 0.0112, "num_tokens": 284762001.0, "reward": 1.798828125, "reward_std": 0.11582310497760773, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22781464457511902, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.744140625, "rewards/tag_count_reward/std": 0.03789619356393814, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13843133907996927, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13040515015917356, "kl": 0.2763671875, "learning_rate": 1.9911078750142857e-05, "loss": 0.011, "num_tokens": 285328177.0, "reward": 1.7744140625, "reward_std": 0.12477175146341324, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.7431640625, "rewards/tag_count_reward/std": 0.040850620716810226, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13860203123666467, "frac_reward_zero_std": 0.1875, "grad_norm": 0.7198191213263033, "kl": 0.38330078125, "learning_rate": 1.9910284230154408e-05, "loss": 0.0154, "num_tokens": 285890129.0, "reward": 1.86328125, "reward_std": 0.2497934252023697, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3483152687549591, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.057389069348573685, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13877272339336008, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1496596546909635, "kl": 0.29296875, "learning_rate": 1.9909486192372795e-05, "loss": 0.0117, "num_tokens": 286461537.0, "reward": 1.7490234375, "reward_std": 0.14155305922031403, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.0632839947938919, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1389434155500555, "frac_reward_zero_std": 0.375, "grad_norm": 0.13464636411656045, "kl": 0.27294921875, "learning_rate": 1.9908684637081297e-05, "loss": 0.0109, "num_tokens": 287037921.0, "reward": 1.7431640625, "reward_std": 0.1320660263299942, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20024390518665314, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.7275390625, "rewards/tag_count_reward/std": 0.09768597036600113, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13911410770675087, "frac_reward_zero_std": 0.3125, "grad_norm": 42.09290752318986, "kl": 7.2099609375, "learning_rate": 1.9907879564564436e-05, "loss": 0.2887, "num_tokens": 287613809.0, "reward": 1.8388671875, "reward_std": 0.20180334150791168, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.06704521179199219, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13928479986344627, "frac_reward_zero_std": 0.375, "grad_norm": 2.9401738122800563, "kl": 0.7490234375, "learning_rate": 1.9907070975107984e-05, "loss": 0.03, "num_tokens": 288181233.0, "reward": 1.736328125, "reward_std": 0.16280615329742432, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.748046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13945549202014168, "frac_reward_zero_std": 0.1875, "grad_norm": 1.378237973779549, "kl": 0.63037109375, "learning_rate": 1.9906258868998956e-05, "loss": 0.0252, "num_tokens": 288747137.0, "reward": 1.7421875, "reward_std": 0.30549630522727966, "rewards/accuracy_reward/mean": 0.08203125, "rewards/accuracy_reward/std": 0.2749498784542084, "rewards/format_reward/mean": 0.91796875, "rewards/format_reward/std": 0.2749498784542084, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.04358336701989174, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1396261841768371, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3001670581135097, "kl": 0.67724609375, "learning_rate": 1.9905443246525628e-05, "loss": 0.027, "num_tokens": 289312209.0, "reward": 1.759765625, "reward_std": 0.25851500034332275, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 0.94140625, "rewards/format_reward/std": 0.23532284796237946, "rewards/tag_count_reward/mean": 0.744140625, "rewards/tag_count_reward/std": 0.03789619356393814, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13979687633353247, "frac_reward_zero_std": 0.25, "grad_norm": 0.3272026382597326, "kl": 0.2919921875, "learning_rate": 1.9904624107977515e-05, "loss": 0.0117, "num_tokens": 289884369.0, "reward": 1.70703125, "reward_std": 0.23227009177207947, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.28082075715065, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.044107433408498764, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13996756849022787, "frac_reward_zero_std": 0.375, "grad_norm": 2.250233930234853, "kl": 0.330078125, "learning_rate": 1.9903801453645378e-05, "loss": 0.0132, "num_tokens": 290457201.0, "reward": 1.7431640625, "reward_std": 0.19137868285179138, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.7431640625, "rewards/tag_count_reward/std": 0.051470041275024414, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14013826064692328, "frac_reward_zero_std": 0.1875, "grad_norm": 258.7174095568163, "kl": 22.7138671875, "learning_rate": 1.9902975283821232e-05, "loss": 0.9048, "num_tokens": 291021329.0, "reward": 1.779296875, "reward_std": 0.2574232518672943, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.740234375, "rewards/tag_count_reward/std": 0.05334262177348137, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1403089528036187, "frac_reward_zero_std": 0.5, "grad_norm": 0.5119412653601303, "kl": 0.3583984375, "learning_rate": 1.9902145598798338e-05, "loss": 0.0144, "num_tokens": 291588289.0, "reward": 1.869140625, "reward_std": 0.16680088639259338, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.744140625, "rewards/tag_count_reward/std": 0.04388983175158501, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14047964496031407, "frac_reward_zero_std": 0.5, "grad_norm": 0.234011578419271, "kl": 0.36083984375, "learning_rate": 1.9901312398871205e-05, "loss": 0.0144, "num_tokens": 292155105.0, "reward": 1.7841796875, "reward_std": 0.1229480504989624, "rewards/accuracy_reward/mean": 0.05833333358168602, "rewards/accuracy_reward/std": 0.23486249148845673, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.7451171875, "rewards/tag_count_reward/std": 0.041130900382995605, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14065033711700947, "frac_reward_zero_std": 0.1875, "grad_norm": 0.9588428446334432, "kl": 0.5869140625, "learning_rate": 1.9900475684335582e-05, "loss": 0.0235, "num_tokens": 292719729.0, "reward": 1.8369140625, "reward_std": 0.20139814913272858, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.7392578125, "rewards/tag_count_reward/std": 0.0554114393889904, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14082102927370488, "frac_reward_zero_std": 0.3125, "grad_norm": 1.3677329554416693, "kl": 0.7138671875, "learning_rate": 1.9899635455488482e-05, "loss": 0.0286, "num_tokens": 293278929.0, "reward": 1.703125, "reward_std": 0.2062460482120514, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.22781464457511902, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.06824120879173279, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14099172143040029, "frac_reward_zero_std": 0.3125, "grad_norm": 3.0365403438508354, "kl": 0.8369140625, "learning_rate": 1.989879171262815e-05, "loss": 0.0335, "num_tokens": 293842785.0, "reward": 1.8251953125, "reward_std": 0.20856858789920807, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.7431640625, "rewards/tag_count_reward/std": 0.0464647077023983, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14116241358709566, "frac_reward_zero_std": 0.375, "grad_norm": 1.9234797101952745, "kl": 0.7734375, "learning_rate": 1.9897944456054087e-05, "loss": 0.0309, "num_tokens": 294413505.0, "reward": 1.7255859375, "reward_std": 0.10461903363466263, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.06704521179199219, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14133310574379107, "frac_reward_zero_std": 0.3125, "grad_norm": 1.3079361797602116, "kl": 0.68359375, "learning_rate": 1.9897093686067035e-05, "loss": 0.0274, "num_tokens": 294984513.0, "reward": 1.7763671875, "reward_std": 0.15199127793312073, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.05928463488817215, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14150379790048648, "frac_reward_zero_std": 0.3125, "grad_norm": 0.20993739909463568, "kl": 0.357421875, "learning_rate": 1.989623940296899e-05, "loss": 0.0143, "num_tokens": 295551361.0, "reward": 1.7568359375, "reward_std": 0.09069855511188507, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7333984375, "rewards/tag_count_reward/std": 0.062369659543037415, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14167449005718188, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1974136316134969, "kl": 0.37744140625, "learning_rate": 1.9895381607063195e-05, "loss": 0.0151, "num_tokens": 296112721.0, "reward": 1.8447265625, "reward_std": 0.14533992111682892, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7314453125, "rewards/tag_count_reward/std": 0.06929237395524979, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14184518221387726, "frac_reward_zero_std": 0.125, "grad_norm": 0.9474320971883574, "kl": 0.458984375, "learning_rate": 1.9894520298654126e-05, "loss": 0.0183, "num_tokens": 296676561.0, "reward": 1.7890625, "reward_std": 0.21913115680217743, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.06454972177743912, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14201587437057267, "frac_reward_zero_std": 0.25, "grad_norm": 0.25682000097183255, "kl": 0.39599609375, "learning_rate": 1.9893655478047526e-05, "loss": 0.0159, "num_tokens": 297244641.0, "reward": 1.7412109375, "reward_std": 0.19579851627349854, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.7451171875, "rewards/tag_count_reward/std": 0.05169277638196945, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14218656652726808, "frac_reward_zero_std": 0.0, "grad_norm": 0.1638551989367017, "kl": 0.28564453125, "learning_rate": 1.9892787145550372e-05, "loss": 0.0114, "num_tokens": 297811025.0, "reward": 1.673828125, "reward_std": 0.32697010040283203, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.90234375, "rewards/format_reward/std": 0.29743078351020813, "rewards/tag_count_reward/mean": 0.744140625, "rewards/tag_count_reward/std": 0.05828297883272171, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14235725868396348, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2249881417185162, "kl": 0.265625, "learning_rate": 1.989191530147089e-05, "loss": 0.0106, "num_tokens": 298370657.0, "reward": 1.78125, "reward_std": 0.4124031662940979, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 0.8828125, "rewards/format_reward/std": 0.3222736418247223, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.07301289588212967, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14252795084065886, "frac_reward_zero_std": 0.1875, "grad_norm": 0.21276327182542698, "kl": 0.28271484375, "learning_rate": 1.989103994611855e-05, "loss": 0.0113, "num_tokens": 298936833.0, "reward": 1.6435546875, "reward_std": 0.2892402708530426, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.88671875, "rewards/format_reward/std": 0.31755712628364563, "rewards/tag_count_reward/mean": 0.7451171875, "rewards/tag_count_reward/std": 0.06043620780110359, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14269864299735427, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2307649157666903, "kl": 0.7119140625, "learning_rate": 1.989016107980408e-05, "loss": 0.0285, "num_tokens": 299500673.0, "reward": 1.62890625, "reward_std": 0.2877618074417114, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.8984375, "rewards/format_reward/std": 0.3026638329029083, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.09450270235538483, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14286933515404968, "frac_reward_zero_std": 0.125, "grad_norm": 0.11013078336734448, "kl": 0.204833984375, "learning_rate": 1.9889278702839438e-05, "loss": 0.0082, "num_tokens": 300061601.0, "reward": 1.8115234375, "reward_std": 0.21569252014160156, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.20318391919136047, "rewards/tag_count_reward/mean": 0.7333984375, "rewards/tag_count_reward/std": 0.07963011413812637, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14304002731074508, "frac_reward_zero_std": 0.0625, "grad_norm": 0.1409081278896753, "kl": 0.22705078125, "learning_rate": 1.988839281553784e-05, "loss": 0.0091, "num_tokens": 300626945.0, "reward": 1.7275390625, "reward_std": 0.17998768389225006, "rewards/accuracy_reward/mean": 0.02734375, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.7158203125, "rewards/tag_count_reward/std": 0.08885829150676727, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14321071946744046, "frac_reward_zero_std": 0.25, "grad_norm": 0.11447767978728374, "kl": 0.20654296875, "learning_rate": 1.9887503418213746e-05, "loss": 0.0083, "num_tokens": 301192913.0, "reward": 1.767578125, "reward_std": 0.1806841641664505, "rewards/accuracy_reward/mean": 0.05000000074505806, "rewards/accuracy_reward/std": 0.21840041875839233, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.740234375, "rewards/tag_count_reward/std": 0.06932690739631653, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14338141162413587, "frac_reward_zero_std": 0.0, "grad_norm": 0.11679706530271229, "kl": 0.196533203125, "learning_rate": 1.9886610511182853e-05, "loss": 0.0079, "num_tokens": 301765601.0, "reward": 1.7177734375, "reward_std": 0.23180001974105835, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21178513765335083, "rewards/tag_count_reward/mean": 0.7333984375, "rewards/tag_count_reward/std": 0.07963011413812637, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14355210378083127, "frac_reward_zero_std": 0.375, "grad_norm": 0.11914682196759367, "kl": 0.23046875, "learning_rate": 1.9885714094762116e-05, "loss": 0.0092, "num_tokens": 302335153.0, "reward": 1.7841796875, "reward_std": 0.1147407740354538, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.05928463488817215, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14372279593752668, "frac_reward_zero_std": 0.5, "grad_norm": 1.0457283258417267, "kl": 0.482177734375, "learning_rate": 1.9884814169269727e-05, "loss": 0.0193, "num_tokens": 302904593.0, "reward": 1.8310546875, "reward_std": 0.10531135648488998, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3077581524848938, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.7412109375, "rewards/tag_count_reward/std": 0.05117155611515045, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14389348809422206, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6074930832361028, "kl": 0.40087890625, "learning_rate": 1.988391073502513e-05, "loss": 0.016, "num_tokens": 303468913.0, "reward": 1.796875, "reward_std": 0.17510941624641418, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.04358336701989174, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14406418025091747, "frac_reward_zero_std": 0.3125, "grad_norm": 0.270723128406654, "kl": 0.439453125, "learning_rate": 1.9883003792349013e-05, "loss": 0.0176, "num_tokens": 304036897.0, "reward": 1.7685546875, "reward_std": 0.18710127472877502, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.7490234375, "rewards/tag_count_reward/std": 0.051914554089307785, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14423487240761287, "frac_reward_zero_std": 0.25, "grad_norm": 0.33676629725053575, "kl": 0.351318359375, "learning_rate": 1.9882093341563307e-05, "loss": 0.014, "num_tokens": 304600961.0, "reward": 1.7685546875, "reward_std": 0.2624933123588562, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.22781464457511902, "rewards/tag_count_reward/mean": 0.7451171875, "rewards/tag_count_reward/std": 0.06436405330896378, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14440556456430828, "frac_reward_zero_std": 0.25, "grad_norm": 0.40854630230432604, "kl": 0.54248046875, "learning_rate": 1.9881179382991186e-05, "loss": 0.0217, "num_tokens": 305169041.0, "reward": 1.75390625, "reward_std": 0.22571304440498352, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.0625, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14457625672100366, "frac_reward_zero_std": 0.3125, "grad_norm": 0.5677456322036526, "kl": 0.50341796875, "learning_rate": 1.9880261916957072e-05, "loss": 0.0201, "num_tokens": 305733025.0, "reward": 1.8173828125, "reward_std": 0.20800702273845673, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3077581524848938, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.7431640625, "rewards/tag_count_reward/std": 0.05603000521659851, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14474694887769907, "frac_reward_zero_std": 0.4375, "grad_norm": 1.2361474027251491, "kl": 0.6884765625, "learning_rate": 1.9879340943786635e-05, "loss": 0.0276, "num_tokens": 306299377.0, "reward": 1.78125, "reward_std": 0.14309704303741455, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.05805254727602005, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14491764103439447, "frac_reward_zero_std": 0.3125, "grad_norm": 0.22642057261709886, "kl": 0.47509765625, "learning_rate": 1.9878416463806788e-05, "loss": 0.019, "num_tokens": 306874273.0, "reward": 1.7314453125, "reward_std": 0.13791170716285706, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.7392578125, "rewards/tag_count_reward/std": 0.06364604830741882, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14508833319108988, "frac_reward_zero_std": 0.125, "grad_norm": 0.8128133815404788, "kl": 0.60888671875, "learning_rate": 1.9877488477345686e-05, "loss": 0.0244, "num_tokens": 307438209.0, "reward": 1.7607421875, "reward_std": 0.22133946418762207, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.7333984375, "rewards/tag_count_reward/std": 0.08556486666202545, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14525902534778526, "frac_reward_zero_std": 0.0, "grad_norm": 1.7014279568209298, "kl": 0.49658203125, "learning_rate": 1.987655698473273e-05, "loss": 0.0198, "num_tokens": 308006289.0, "reward": 1.755859375, "reward_std": 0.2631751596927643, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2561737895011902, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.720703125, "rewards/tag_count_reward/std": 0.09193504601716995, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14542971750448067, "frac_reward_zero_std": 0.0, "grad_norm": 0.17058040381111036, "kl": 0.44140625, "learning_rate": 1.987562198629857e-05, "loss": 0.0177, "num_tokens": 308568001.0, "reward": 1.748046875, "reward_std": 0.22438040375709534, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.2769629955291748, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.708984375, "rewards/tag_count_reward/std": 0.11409208923578262, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14560040966117607, "frac_reward_zero_std": 0.0625, "grad_norm": 1.365600941741843, "kl": 0.395263671875, "learning_rate": 1.9874683482375094e-05, "loss": 0.0158, "num_tokens": 309136769.0, "reward": 1.798828125, "reward_std": 0.2252977192401886, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3077581524848938, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.712890625, "rewards/tag_count_reward/std": 0.10189308226108551, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14577110181787148, "frac_reward_zero_std": 0.25, "grad_norm": 0.7979769072951628, "kl": 0.4453125, "learning_rate": 1.987374147329544e-05, "loss": 0.0178, "num_tokens": 309700769.0, "reward": 1.7265625, "reward_std": 0.14077717065811157, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.07739239931106567, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14594179397456686, "frac_reward_zero_std": 0.1875, "grad_norm": 0.7907383495516737, "kl": 0.3984375, "learning_rate": 1.987279595939398e-05, "loss": 0.0159, "num_tokens": 310263569.0, "reward": 1.8125, "reward_std": 0.18752454221248627, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.07174300402402878, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14611248613126226, "frac_reward_zero_std": 0.25, "grad_norm": 0.5636336027143111, "kl": 0.33154296875, "learning_rate": 1.9871846941006344e-05, "loss": 0.0133, "num_tokens": 310825761.0, "reward": 1.83984375, "reward_std": 0.21351079642772675, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.044107433408498764, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14628317828795767, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12051532425199002, "kl": 0.45703125, "learning_rate": 1.9870894418469394e-05, "loss": 0.0183, "num_tokens": 311396145.0, "reward": 1.74609375, "reward_std": 0.08331455290317535, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.04358336701989174, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14645387044465308, "frac_reward_zero_std": 0.375, "grad_norm": 0.17033448499655593, "kl": 0.53271484375, "learning_rate": 1.986993839212125e-05, "loss": 0.0213, "num_tokens": 311959537.0, "reward": 1.7744140625, "reward_std": 0.13587257266044617, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.7392578125, "rewards/tag_count_reward/std": 0.0554114393889904, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14662456260134846, "frac_reward_zero_std": 0.4375, "grad_norm": 1.413939060037888, "kl": 0.46142578125, "learning_rate": 1.9868978862301257e-05, "loss": 0.0185, "num_tokens": 312525585.0, "reward": 1.875, "reward_std": 0.15577185153961182, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03814799711108208, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14679525475804386, "frac_reward_zero_std": 0.3125, "grad_norm": 2.9227970223917494, "kl": 1.27734375, "learning_rate": 1.9868015829350018e-05, "loss": 0.0511, "num_tokens": 313102161.0, "reward": 1.830078125, "reward_std": 0.1565839648246765, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3077581524848938, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.744140625, "rewards/tag_count_reward/std": 0.04388983175158501, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14696594691473927, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3233188047460361, "kl": 1.140625, "learning_rate": 1.9867049293609374e-05, "loss": 0.0455, "num_tokens": 313668049.0, "reward": 1.798828125, "reward_std": 0.12217988073825836, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.744140625, "rewards/tag_count_reward/std": 0.04388983175158501, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14713663907143468, "frac_reward_zero_std": 0.625, "grad_norm": 0.5971783721738612, "kl": 1.076171875, "learning_rate": 1.986607925542241e-05, "loss": 0.043, "num_tokens": 314238481.0, "reward": 1.880859375, "reward_std": 0.11512990295886993, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3483152687549591, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.748046875, "rewards/tag_count_reward/std": 0.03125, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14730733122813006, "frac_reward_zero_std": 0.4375, "grad_norm": 1.0227482778786319, "kl": 0.751953125, "learning_rate": 1.9865105715133462e-05, "loss": 0.0301, "num_tokens": 314805585.0, "reward": 1.7841796875, "reward_std": 0.1905418336391449, "rewards/accuracy_reward/mean": 0.07083333283662796, "rewards/accuracy_reward/std": 0.25708237290382385, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.7451171875, "rewards/tag_count_reward/std": 0.078125, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14747802338482546, "frac_reward_zero_std": 0.5, "grad_norm": 1.4989354254103295, "kl": 0.44873046875, "learning_rate": 1.986412867308809e-05, "loss": 0.0179, "num_tokens": 315381841.0, "reward": 1.8291015625, "reward_std": 0.17929750680923462, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.7431640625, "rewards/tag_count_reward/std": 0.051470041275024414, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14764871554152087, "frac_reward_zero_std": 0.5, "grad_norm": 0.7974163087199583, "kl": 0.35400390625, "learning_rate": 1.986314812963311e-05, "loss": 0.0142, "num_tokens": 315948785.0, "reward": 1.7587890625, "reward_std": 0.14899903535842896, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.7470703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14781940769821628, "frac_reward_zero_std": 0.25, "grad_norm": 0.18522942390532884, "kl": 0.3466796875, "learning_rate": 1.986216408511659e-05, "loss": 0.0139, "num_tokens": 316518737.0, "reward": 1.7685546875, "reward_std": 0.20305541157722473, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.7412109375, "rewards/tag_count_reward/std": 0.05999100208282471, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14799009985491166, "frac_reward_zero_std": 0.4375, "grad_norm": 0.16956939568111074, "kl": 0.35595703125, "learning_rate": 1.986117653988782e-05, "loss": 0.0142, "num_tokens": 317087793.0, "reward": 1.77734375, "reward_std": 0.1077437624335289, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.04358336701989174, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14816079201160706, "frac_reward_zero_std": 0.3125, "grad_norm": 153.43745367966977, "kl": 20.908203125, "learning_rate": 1.9860185494297348e-05, "loss": 0.8359, "num_tokens": 317649905.0, "reward": 1.740234375, "reward_std": 0.11457280814647675, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.740234375, "rewards/tag_count_reward/std": 0.05334262177348137, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14833148416830247, "frac_reward_zero_std": 0.75, "grad_norm": 0.11065362308518505, "kl": 0.38330078125, "learning_rate": 1.9859190948696952e-05, "loss": 0.0153, "num_tokens": 318216033.0, "reward": 1.7265625, "reward_std": 0.078809455037117, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14850217632499788, "frac_reward_zero_std": 0.3125, "grad_norm": 24.877300690883274, "kl": 5.1474609375, "learning_rate": 1.9858192903439674e-05, "loss": 0.2057, "num_tokens": 318783457.0, "reward": 1.77734375, "reward_std": 0.21906989812850952, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.20318391919136047, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.08539126068353653, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14867286848169325, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3767566092103178, "kl": 0.46875, "learning_rate": 1.985719135887977e-05, "loss": 0.0187, "num_tokens": 319347537.0, "reward": 1.6171875, "reward_std": 0.43036872148513794, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33136674761772156, "rewards/tag_count_reward/mean": 0.69921875, "rewards/tag_count_reward/std": 0.11444398760795593, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14884356063838866, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3174891480570158, "kl": 0.6171875, "learning_rate": 1.985618631537276e-05, "loss": 0.0247, "num_tokens": 319915297.0, "reward": 1.7705078125, "reward_std": 0.260159969329834, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.7158203125, "rewards/tag_count_reward/std": 0.09421352297067642, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14901425279508407, "frac_reward_zero_std": 0.0, "grad_norm": 0.9391592967741187, "kl": 0.9169921875, "learning_rate": 1.9855177773275395e-05, "loss": 0.0368, "num_tokens": 320482113.0, "reward": 1.75390625, "reward_std": 0.246629536151886, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24256734549999237, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.07629599422216415, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14918494495177947, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9752982634917597, "kl": 0.7431640625, "learning_rate": 1.9854165732945674e-05, "loss": 0.0297, "num_tokens": 321051441.0, "reward": 1.845703125, "reward_std": 0.22222383320331573, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.3600577116012573, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.728515625, "rewards/tag_count_reward/std": 0.08300333470106125, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14935563710847485, "frac_reward_zero_std": 0.1875, "grad_norm": 2.2946663072000013, "kl": 0.7470703125, "learning_rate": 1.985315019474283e-05, "loss": 0.0299, "num_tokens": 321630289.0, "reward": 1.6767578125, "reward_std": 0.2563677430152893, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.93359375, "rewards/format_reward/std": 0.24947863817214966, "rewards/tag_count_reward/mean": 0.7197265625, "rewards/tag_count_reward/std": 0.09294486790895462, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14952632926517026, "frac_reward_zero_std": 0.1875, "grad_norm": 1.3520625693962105, "kl": 0.45068359375, "learning_rate": 1.9852131159027347e-05, "loss": 0.018, "num_tokens": 322190817.0, "reward": 1.7529296875, "reward_std": 0.3126060366630554, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.9296875, "rewards/format_reward/std": 0.2561737895011902, "rewards/tag_count_reward/mean": 0.7333984375, "rewards/tag_count_reward/std": 0.08556486666202545, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14969702142186567, "frac_reward_zero_std": 0.0, "grad_norm": 0.6018790831244496, "kl": 0.4228515625, "learning_rate": 1.9851108626160943e-05, "loss": 0.0169, "num_tokens": 322752753.0, "reward": 1.625, "reward_std": 0.3542720675468445, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.8671875, "rewards/format_reward/std": 0.3400367796421051, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.07827804237604141, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.14986771357856107, "frac_reward_zero_std": 0.0625, "grad_norm": 0.763055981593738, "kl": 0.4326171875, "learning_rate": 1.985008259650658e-05, "loss": 0.0173, "num_tokens": 323316113.0, "reward": 1.6650390625, "reward_std": 0.2759276330471039, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2920515835285187, "rewards/tag_count_reward/mean": 0.7470703125, "rewards/tag_count_reward/std": 0.06817805022001266, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15003840573525645, "frac_reward_zero_std": 0.125, "grad_norm": 0.8094430747590077, "kl": 0.375, "learning_rate": 1.9849053070428466e-05, "loss": 0.015, "num_tokens": 323882049.0, "reward": 1.8095703125, "reward_std": 0.25366663932800293, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 0.94921875, "rewards/format_reward/std": 0.21998079121112823, "rewards/tag_count_reward/mean": 0.7470703125, "rewards/tag_count_reward/std": 0.06448294222354889, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15020909789195186, "frac_reward_zero_std": 0.25, "grad_norm": 0.40191835124812697, "kl": 0.3701171875, "learning_rate": 1.9848020048292044e-05, "loss": 0.0148, "num_tokens": 324451329.0, "reward": 1.791015625, "reward_std": 0.2199457585811615, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.20318391919136047, "rewards/tag_count_reward/mean": 0.740234375, "rewards/tag_count_reward/std": 0.06185327470302582, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15037979004864727, "frac_reward_zero_std": 0.1875, "grad_norm": 0.4264536958795337, "kl": 0.3681640625, "learning_rate": 1.9846983530463994e-05, "loss": 0.0147, "num_tokens": 325013009.0, "reward": 1.7998046875, "reward_std": 0.1750878095626831, "rewards/accuracy_reward/mean": 0.06640625, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7333984375, "rewards/tag_count_reward/std": 0.07321586459875107, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15055048220534267, "frac_reward_zero_std": 0.25, "grad_norm": 0.7885292745518017, "kl": 0.4140625, "learning_rate": 1.9845943517312247e-05, "loss": 0.0165, "num_tokens": 325573649.0, "reward": 1.7998046875, "reward_std": 0.20374208688735962, "rewards/accuracy_reward/mean": 0.07421875, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.07399629801511765, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15072117436203805, "frac_reward_zero_std": 0.3125, "grad_norm": 1.8075346359136286, "kl": 0.43701171875, "learning_rate": 1.984490000920597e-05, "loss": 0.0175, "num_tokens": 326149921.0, "reward": 1.7822265625, "reward_std": 0.11398105323314667, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.7353515625, "rewards/tag_count_reward/std": 0.07022564113140106, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15089186651873346, "frac_reward_zero_std": 0.3125, "grad_norm": 1.7156749141447039, "kl": 0.48681640625, "learning_rate": 1.984385300651557e-05, "loss": 0.0194, "num_tokens": 326712529.0, "reward": 1.8076171875, "reward_std": 0.1275406777858734, "rewards/accuracy_reward/mean": 0.07500000298023224, "rewards/accuracy_reward/std": 0.26394182443618774, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.7490234375, "rewards/tag_count_reward/std": 0.056438617408275604, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15106255867542887, "frac_reward_zero_std": 0.0, "grad_norm": 4.48341639292765, "kl": 0.6533203125, "learning_rate": 1.9842802509612695e-05, "loss": 0.0261, "num_tokens": 327279057.0, "reward": 1.810546875, "reward_std": 0.22065824270248413, "rewards/accuracy_reward/mean": 0.05078125, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.791015625, "rewards/tag_count_reward/std": 0.12238377332687378, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15123325083212427, "frac_reward_zero_std": 0.0, "grad_norm": 1.3473185343208516, "kl": 0.9677734375, "learning_rate": 1.9841748518870233e-05, "loss": 0.0387, "num_tokens": 327849633.0, "reward": 1.908203125, "reward_std": 0.2627088725566864, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.94921875, "rewards/format_reward/std": 0.21998079121112823, "rewards/tag_count_reward/mean": 0.896484375, "rewards/tag_count_reward/std": 0.15348036587238312, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15140394298881965, "frac_reward_zero_std": 0.0, "grad_norm": 10.110295965881193, "kl": 1.0078125, "learning_rate": 1.9840691034662316e-05, "loss": 0.0403, "num_tokens": 328418577.0, "reward": 1.8623046875, "reward_std": 0.2247655987739563, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.8818359375, "rewards/tag_count_reward/std": 0.13266554474830627, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15157463514551506, "frac_reward_zero_std": 0.0, "grad_norm": 2.304532146785526, "kl": 2.66015625, "learning_rate": 1.9839630057364308e-05, "loss": 0.1065, "num_tokens": 328982049.0, "reward": 1.8681640625, "reward_std": 0.4297429323196411, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3638034462928772, "rewards/tag_count_reward/mean": 0.9462890625, "rewards/tag_count_reward/std": 0.12444346398115158, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1260.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 625.0, "completions/min_terminated_length": 0.0, "epoch": 0.15174532730221046, "frac_reward_zero_std": 0.0, "grad_norm": 0.38959856689984, "kl": 3.36328125, "learning_rate": 1.983856558735282e-05, "loss": 0.1345, "num_tokens": 329345649.0, "reward": 1.9599609375, "reward_std": 0.23182368278503418, "rewards/accuracy_reward/mean": 0.02500000037252903, "rewards/accuracy_reward/std": 0.1564512401819229, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.9677734375, "rewards/tag_count_reward/std": 0.09490203112363815, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15191601945890587, "frac_reward_zero_std": 0.0, "grad_norm": 0.22855012244394673, "kl": 1.01953125, "learning_rate": 1.9837497625005703e-05, "loss": 0.0407, "num_tokens": 329911537.0, "reward": 1.8818359375, "reward_std": 0.28788891434669495, "rewards/accuracy_reward/mean": 0.05859375, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.8623046875, "rewards/tag_count_reward/std": 0.1429210752248764, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15208671161560125, "frac_reward_zero_std": 0.0625, "grad_norm": 0.13614177833375138, "kl": 2.68359375, "learning_rate": 1.983642617070204e-05, "loss": 0.1074, "num_tokens": 330474897.0, "reward": 1.93359375, "reward_std": 0.20012784004211426, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.11042476445436478, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15225740377229666, "frac_reward_zero_std": 0.0625, "grad_norm": 0.19903215653081743, "kl": 2.80859375, "learning_rate": 1.9835351224822164e-05, "loss": 0.1123, "num_tokens": 331040769.0, "reward": 1.884765625, "reward_std": 0.3750731348991394, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.9296875, "rewards/format_reward/std": 0.2561737895011902, "rewards/tag_count_reward/mean": 0.943359375, "rewards/tag_count_reward/std": 0.1648375391960144, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1751.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 836.0, "completions/min_terminated_length": 0.0, "epoch": 0.15242809592899206, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1804086407672663, "kl": 2.98828125, "learning_rate": 1.9834272787747635e-05, "loss": 0.1195, "num_tokens": 331528321.0, "reward": 1.9853515625, "reward_std": 0.23961302638053894, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21178513765335083, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.9736328125, "rewards/tag_count_reward/std": 0.12142167240381241, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1560.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 662.0, "completions/min_terminated_length": 0.0, "epoch": 0.15259878808568747, "frac_reward_zero_std": 0.125, "grad_norm": 0.2121367655937142, "kl": 3.234375, "learning_rate": 1.9833190859861266e-05, "loss": 0.1296, "num_tokens": 331966225.0, "reward": 1.9306640625, "reward_std": 0.3517279326915741, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.93359375, "rewards/format_reward/std": 0.24947863817214966, "rewards/tag_count_reward/mean": 0.9580078125, "rewards/tag_count_reward/std": 0.12888792157173157, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1192.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 639.0, "completions/min_terminated_length": 0.0, "epoch": 0.15276948024238285, "frac_reward_zero_std": 0.375, "grad_norm": 0.33032046722519737, "kl": 3.17578125, "learning_rate": 1.9832105441547094e-05, "loss": 0.1269, "num_tokens": 332314001.0, "reward": 2.0625, "reward_std": 0.24250395596027374, "rewards/accuracy_reward/mean": 0.11249999701976776, "rewards/accuracy_reward/std": 0.3166410028934479, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.08430802822113037, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1289.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 698.0, "completions/min_terminated_length": 0.0, "epoch": 0.15294017239907826, "frac_reward_zero_std": 0.125, "grad_norm": 0.49626689623675385, "kl": 3.11328125, "learning_rate": 1.9831016533190416e-05, "loss": 0.1246, "num_tokens": 332684993.0, "reward": 2.060546875, "reward_std": 0.3335253596305847, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.970703125, "rewards/tag_count_reward/std": 0.12375297397375107, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1547.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 851.0, "completions/min_terminated_length": 0.0, "epoch": 0.15311086455577366, "frac_reward_zero_std": 0.1875, "grad_norm": 6.917978192495025, "kl": 3.046875, "learning_rate": 1.9829924135177748e-05, "loss": 0.1217, "num_tokens": 333120481.0, "reward": 1.8798828125, "reward_std": 0.34155043959617615, "rewards/accuracy_reward/mean": 0.01171875, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 0.92578125, "rewards/format_reward/std": 0.2626400291919708, "rewards/tag_count_reward/mean": 0.9423828125, "rewards/tag_count_reward/std": 0.16224704682826996, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1787.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 881.0, "completions/min_terminated_length": 0.0, "epoch": 0.15328155671246907, "frac_reward_zero_std": 0.125, "grad_norm": 9.52370556624578, "kl": 2.8359375, "learning_rate": 1.982882824789685e-05, "loss": 0.1134, "num_tokens": 333615249.0, "reward": 1.9150390625, "reward_std": 0.2889781594276428, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.94921875, "rewards/format_reward/std": 0.21998079121112823, "rewards/tag_count_reward/mean": 0.9462890625, "rewards/tag_count_reward/std": 0.15273936092853546, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1985.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 1550.0, "completions/min_terminated_length": 0.0, "epoch": 0.15345224886916445, "frac_reward_zero_std": 0.0625, "grad_norm": 9.130978183923647, "kl": 3.07421875, "learning_rate": 1.982772887173672e-05, "loss": 0.1231, "num_tokens": 334161681.0, "reward": 2.0458984375, "reward_std": 0.4104311466217041, "rewards/accuracy_reward/mean": 0.20703125, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 0.93359375, "rewards/format_reward/std": 0.24947863817214966, "rewards/tag_count_reward/mean": 0.9052734375, "rewards/tag_count_reward/std": 0.19317147135734558, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15362294102585985, "frac_reward_zero_std": 0.0, "grad_norm": 674.7499424688242, "kl": 86.3125, "learning_rate": 1.9826626007087604e-05, "loss": 3.4555, "num_tokens": 334728337.0, "reward": 0.6943359375, "reward_std": 0.5059700012207031, "rewards/accuracy_reward/mean": 0.01953125, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.1015625, "rewards/format_reward/std": 0.3026638329029083, "rewards/tag_count_reward/mean": 0.5732421875, "rewards/tag_count_reward/std": 0.3119465112686157, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15379363318255526, "frac_reward_zero_std": 0.0, "grad_norm": 112.90193041067255, "kl": 18.171875, "learning_rate": 1.9825519654340975e-05, "loss": 0.7284, "num_tokens": 335297329.0, "reward": 0.669921875, "reward_std": 0.2326807826757431, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.638671875, "rewards/tag_count_reward/std": 0.21076098084449768, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15396432533925067, "frac_reward_zero_std": 0.0, "grad_norm": 84.25582318696468, "kl": 14.125, "learning_rate": 1.9824409813889555e-05, "loss": 0.565, "num_tokens": 335864033.0, "reward": 0.6162109375, "reward_std": 0.27746880054473877, "rewards/accuracy_reward/mean": 0.04296875, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5732421875, "rewards/tag_count_reward/std": 0.2539044916629791, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15413501749594608, "frac_reward_zero_std": 0.0, "grad_norm": 34.41194491487563, "kl": 4.6328125, "learning_rate": 1.982329648612729e-05, "loss": 0.1857, "num_tokens": 336423409.0, "reward": 0.4775390625, "reward_std": 0.27819669246673584, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.4658203125, "rewards/tag_count_reward/std": 0.27839475870132446, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1903.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 889.0, "completions/min_terminated_length": 0.0, "epoch": 0.15430570965264145, "frac_reward_zero_std": 0.0625, "grad_norm": 25.012534986450532, "kl": 3.26171875, "learning_rate": 1.982217967144937e-05, "loss": 0.1303, "num_tokens": 336959329.0, "reward": 0.7509765625, "reward_std": 0.20796921849250793, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2561737895011902, "rewards/format_reward/mean": 0.01171875, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.6689453125, "rewards/tag_count_reward/std": 0.18263448774814606, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1802.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 822.0, "completions/min_terminated_length": 0.0, "epoch": 0.15447640180933686, "frac_reward_zero_std": 0.0, "grad_norm": 32.404099165178025, "kl": 3.64453125, "learning_rate": 1.9821059370252227e-05, "loss": 0.1456, "num_tokens": 337465105.0, "reward": 0.65234375, "reward_std": 0.34051424264907837, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.04296875, "rewards/format_reward/std": 0.20318391919136047, "rewards/tag_count_reward/mean": 0.59375, "rewards/tag_count_reward/std": 0.2747547924518585, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15464709396603227, "frac_reward_zero_std": 0.0, "grad_norm": 18.287261685125163, "kl": 2.73828125, "learning_rate": 1.981993558293353e-05, "loss": 0.1094, "num_tokens": 338036593.0, "reward": 0.34765625, "reward_std": 0.28027284145355225, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3046875, "rewards/tag_count_reward/std": 0.2464204579591751, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1976.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 1479.0, "completions/min_terminated_length": 0.0, "epoch": 0.15481778612272767, "frac_reward_zero_std": 0.0, "grad_norm": 10.16337155788673, "kl": 3.75, "learning_rate": 1.981880830989218e-05, "loss": 0.1502, "num_tokens": 338583953.0, "reward": 0.3486328125, "reward_std": 0.2623794674873352, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.2587890625, "rewards/tag_count_reward/std": 0.23881016671657562, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1078.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 190.0, "completions/min_terminated_length": 0.0, "epoch": 0.15498847827942305, "frac_reward_zero_std": 0.0, "grad_norm": 162.97263625474207, "kl": 4.734375, "learning_rate": 1.9817677551528314e-05, "loss": 0.1892, "num_tokens": 338897969.0, "reward": 0.3115234375, "reward_std": 0.1548474133014679, "rewards/accuracy_reward/mean": 0.004166666883975267, "rewards/accuracy_reward/std": 0.06454972177743912, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3076171875, "rewards/tag_count_reward/std": 0.19135889410972595, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1521.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 1088.0, "completions/min_terminated_length": 0.0, "epoch": 0.15515917043611846, "frac_reward_zero_std": 0.0, "grad_norm": 17.11799510292501, "kl": 4.9921875, "learning_rate": 1.9816543308243323e-05, "loss": 0.1997, "num_tokens": 339331569.0, "reward": 0.240234375, "reward_std": 0.18285121023654938, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.232421875, "rewards/tag_count_reward/std": 0.2261866182088852, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1297.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 717.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 19.0, "completions/min_terminated_length": 0.0, "epoch": 0.15532986259281387, "frac_reward_zero_std": 0.125, "grad_norm": 2.003042348261558, "kl": 6.59375, "learning_rate": 1.9815405580439807e-05, "loss": 0.2635, "num_tokens": 339558289.0, "reward": 0.4111328125, "reward_std": 0.1392739713191986, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4111328125, "rewards/tag_count_reward/std": 0.17894870042800903, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 48.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 22.0, "completions/min_terminated_length": 0.0, "epoch": 0.15550055474950927, "frac_reward_zero_std": 0.75, "grad_norm": 2.0901376188532086, "kl": 13.421875, "learning_rate": 1.9814264368521632e-05, "loss": 0.5364, "num_tokens": 339626209.0, "reward": 0.4677734375, "reward_std": 0.01953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4677734375, "rewards/tag_count_reward/std": 0.11999396979808807, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1488.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 35.0, "completions/min_terminated_length": 0.0, "epoch": 0.15567124690620465, "frac_reward_zero_std": 0.25, "grad_norm": 113.51864100893786, "kl": 22.078125, "learning_rate": 1.9813119672893877e-05, "loss": 0.8832, "num_tokens": 340053153.0, "reward": 0.1162109375, "reward_std": 0.08779485523700714, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1123046875, "rewards/tag_count_reward/std": 0.12459723651409149, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15584193906290006, "frac_reward_zero_std": 0.625, "grad_norm": 333.19607302878165, "kl": 25.5625, "learning_rate": 1.9811971493962872e-05, "loss": 1.0245, "num_tokens": 340614897.0, "reward": 0.01171875, "reward_std": 0.03824388235807419, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0078125, "rewards/tag_count_reward/std": 0.04358336701989174, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15601263121959547, "frac_reward_zero_std": 0.6875, "grad_norm": 130.08500379383335, "kl": 19.109375, "learning_rate": 1.9810819832136178e-05, "loss": 0.7647, "num_tokens": 341177809.0, "reward": 0.013671875, "reward_std": 0.04485190659761429, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.005859375, "rewards/tag_count_reward/std": 0.03789619356393814, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1962.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 1520.0, "completions/min_terminated_length": 0.0, "epoch": 0.15618332337629087, "frac_reward_zero_std": 0.5625, "grad_norm": 85.16193467113264, "kl": 11.15625, "learning_rate": 1.9809664687822597e-05, "loss": 0.4466, "num_tokens": 341722625.0, "reward": 0.029296875, "reward_std": 0.08560486882925034, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.005859375, "rewards/tag_count_reward/std": 0.03789619356393814, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 54.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 25.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 13.0, "completions/min_terminated_length": 0.0, "epoch": 0.15635401553298625, "frac_reward_zero_std": 1.0, "grad_norm": 0.27645458915368526, "kl": 15.015625, "learning_rate": 1.9808506061432157e-05, "loss": 0.6002, "num_tokens": 341770113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 65.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 44.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 17.0, "completions/min_terminated_length": 0.0, "epoch": 0.15652470768968166, "frac_reward_zero_std": 1.0, "grad_norm": 0.5614298173077857, "kl": 14.984375, "learning_rate": 1.9807343953376135e-05, "loss": 0.5993, "num_tokens": 341821105.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 179.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 92.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 26.0, "completions/min_terminated_length": 0.0, "epoch": 0.15669539984637706, "frac_reward_zero_std": 1.0, "grad_norm": 0.14691080154434918, "kl": 12.796875, "learning_rate": 1.980617836406703e-05, "loss": 0.5121, "num_tokens": 341886097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 370.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 204.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 108.0, "completions/min_terminated_length": 0.0, "epoch": 0.15686609200307247, "frac_reward_zero_std": 1.0, "grad_norm": 0.19089642429408699, "kl": 8.765625, "learning_rate": 1.9805009293918592e-05, "loss": 0.35, "num_tokens": 341987809.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 402.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 250.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 114.0, "completions/min_terminated_length": 0.0, "epoch": 0.15703678415976785, "frac_reward_zero_std": 0.9375, "grad_norm": 0.44101166033717604, "kl": 7.3515625, "learning_rate": 1.980383674334579e-05, "loss": 0.2943, "num_tokens": 342088193.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 683.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 394.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 233.0, "completions/min_terminated_length": 0.0, "epoch": 0.15720747631646326, "frac_reward_zero_std": 1.0, "grad_norm": 0.20375796358443016, "kl": 6.03125, "learning_rate": 1.980266071276485e-05, "loss": 0.2412, "num_tokens": 342227857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 647.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 474.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 347.0, "completions/min_terminated_length": 0.0, "epoch": 0.15737816847315866, "frac_reward_zero_std": 0.9375, "grad_norm": 0.3669099963340817, "kl": 5.6484375, "learning_rate": 1.9801481202593206e-05, "loss": 0.2259, "num_tokens": 342395185.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 684.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 526.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 333.0, "completions/min_terminated_length": 0.0, "epoch": 0.15754886062985407, "frac_reward_zero_std": 0.75, "grad_norm": 3.0273418062343205, "kl": 4.8125, "learning_rate": 1.9800298213249552e-05, "loss": 0.1924, "num_tokens": 342572433.0, "reward": 0.025390625, "reward_std": 0.057111162692308426, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.001953125, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 614.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 460.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 344.0, "completions/min_terminated_length": 0.0, "epoch": 0.15771955278654945, "frac_reward_zero_std": 0.875, "grad_norm": 4.573606405487499, "kl": 4.14453125, "learning_rate": 1.97991117451538e-05, "loss": 0.166, "num_tokens": 342730161.0, "reward": 0.005859375, "reward_std": 0.020961953327059746, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.001953125, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 798.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 477.0, "completions/min_terminated_length": 0.0, "epoch": 0.15789024494324486, "frac_reward_zero_std": 0.5625, "grad_norm": 7.200320012224596, "kl": 3.0078125, "learning_rate": 1.9797921798727112e-05, "loss": 0.1202, "num_tokens": 342992929.0, "reward": 0.0224609375, "reward_std": 0.056527793407440186, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0146484375, "rewards/tag_count_reward/std": 0.058830711990594864, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1332.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1111.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 839.0, "completions/min_terminated_length": 0.0, "epoch": 0.15806093709994026, "frac_reward_zero_std": 0.0, "grad_norm": 3.674565071320908, "kl": 2.640625, "learning_rate": 1.9796728374391866e-05, "loss": 0.1056, "num_tokens": 343321409.0, "reward": 0.13671875, "reward_std": 0.172566220164299, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.11328125, "rewards/tag_count_reward/std": 0.14970354735851288, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1311.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1019.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 847.0, "completions/min_terminated_length": 0.0, "epoch": 0.15823162925663567, "frac_reward_zero_std": 0.0, "grad_norm": 6.825936722919553, "kl": 2.7421875, "learning_rate": 1.979553147257169e-05, "loss": 0.1097, "num_tokens": 343623121.0, "reward": 0.2939453125, "reward_std": 0.22294366359710693, "rewards/accuracy_reward/mean": 0.03515625, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2587890625, "rewards/tag_count_reward/std": 0.18033438920974731, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1139.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 825.0, "completions/min_terminated_length": 0.0, "epoch": 0.15840232141333105, "frac_reward_zero_std": 0.0, "grad_norm": 8.753846147650687, "kl": 3.59765625, "learning_rate": 1.979433109369144e-05, "loss": 0.1437, "num_tokens": 343953521.0, "reward": 0.3115234375, "reward_std": 0.17735090851783752, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3115234375, "rewards/tag_count_reward/std": 0.18223564326763153, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1299.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 954.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 541.0, "completions/min_terminated_length": 0.0, "epoch": 0.15857301357002646, "frac_reward_zero_std": 0.0, "grad_norm": 53.95090152925237, "kl": 9.6640625, "learning_rate": 1.97931272381772e-05, "loss": 0.3866, "num_tokens": 344238241.0, "reward": 0.3095703125, "reward_std": 0.1452869474887848, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3017578125, "rewards/tag_count_reward/std": 0.14350935816764832, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1749.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1372.0, "completions/min_terminated_length": 0.0, "epoch": 0.15874370572672186, "frac_reward_zero_std": 0.0625, "grad_norm": 39.89640190173612, "kl": 4.7890625, "learning_rate": 1.9791919906456308e-05, "loss": 0.1919, "num_tokens": 344725249.0, "reward": 0.1962890625, "reward_std": 0.10035886615514755, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1962890625, "rewards/tag_count_reward/std": 0.10979436337947845, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 54.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 20.0, "completions/min_terminated_length": 0.0, "epoch": 0.15891439788341727, "frac_reward_zero_std": 0.8125, "grad_norm": 2.8553127787585906, "kl": 13.546875, "learning_rate": 1.9790709098957316e-05, "loss": 0.5408, "num_tokens": 344773361.0, "reward": 0.24609375, "reward_std": 0.013149453327059746, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.24609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1157.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 389.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 214.0, "completions/min_terminated_length": 0.0, "epoch": 0.15908509004011265, "frac_reward_zero_std": 0.0, "grad_norm": 66.23898085742624, "kl": 6.109375, "learning_rate": 1.978949481611002e-05, "loss": 0.2445, "num_tokens": 344914753.0, "reward": 0.1240234375, "reward_std": 0.1534115970134735, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1240234375, "rewards/tag_count_reward/std": 0.1565530151128769, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15925578219680805, "frac_reward_zero_std": 0.125, "grad_norm": 2.6810660177135324, "kl": 0.62255859375, "learning_rate": 1.978827705834544e-05, "loss": 0.0249, "num_tokens": 345481713.0, "reward": 0.0380859375, "reward_std": 0.08628939092159271, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0380859375, "rewards/tag_count_reward/std": 0.0953047126531601, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15942647435350346, "frac_reward_zero_std": 0.25, "grad_norm": 1.6776520633229561, "kl": 0.2822265625, "learning_rate": 1.9787055826095847e-05, "loss": 0.0113, "num_tokens": 346046417.0, "reward": 0.021484375, "reward_std": 0.058760739862918854, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.021484375, "rewards/tag_count_reward/std": 0.0702051892876625, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15959716651019887, "frac_reward_zero_std": 0.125, "grad_norm": 6.927276796546991, "kl": 0.39501953125, "learning_rate": 1.9785831119794726e-05, "loss": 0.0158, "num_tokens": 346606465.0, "reward": 0.0732421875, "reward_std": 0.10077184438705444, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0732421875, "rewards/tag_count_reward/std": 0.11400394141674042, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15976785866689425, "frac_reward_zero_std": 1.0, "grad_norm": 2.7328682167246336, "kl": 0.177978515625, "learning_rate": 1.9784602939876804e-05, "loss": 0.0071, "num_tokens": 347170801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15993855082358965, "frac_reward_zero_std": 1.0, "grad_norm": 0.5165897158094516, "kl": 0.2880859375, "learning_rate": 1.9783371286778043e-05, "loss": 0.0115, "num_tokens": 347734289.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16010924298028506, "frac_reward_zero_std": 1.0, "grad_norm": 0.00041136947348897966, "kl": 0.025146484375, "learning_rate": 1.9782136160935637e-05, "loss": 0.001, "num_tokens": 348300129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16027993513698047, "frac_reward_zero_std": 1.0, "grad_norm": 7.959582385501863e-05, "kl": 0.026702880859375, "learning_rate": 1.9780897562788003e-05, "loss": 0.0011, "num_tokens": 348867377.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16045062729367585, "frac_reward_zero_std": 1.0, "grad_norm": 1.3140606112268427e-07, "kl": 0.0245361328125, "learning_rate": 1.977965549277481e-05, "loss": 0.001, "num_tokens": 349435057.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16062131945037125, "frac_reward_zero_std": 1.0, "grad_norm": 9.249766576860692e-08, "kl": 0.024444580078125, "learning_rate": 1.9778409951336938e-05, "loss": 0.001, "num_tokens": 349997681.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16079201160706666, "frac_reward_zero_std": 1.0, "grad_norm": 7.715097399464743e-08, "kl": 0.022857666015625, "learning_rate": 1.9777160938916518e-05, "loss": 0.0009, "num_tokens": 350558849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16096270376376207, "frac_reward_zero_std": 1.0, "grad_norm": 6.698879544540021e-08, "kl": 0.024383544921875, "learning_rate": 1.9775908455956897e-05, "loss": 0.001, "num_tokens": 351115841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16113339592045745, "frac_reward_zero_std": 1.0, "grad_norm": 3.8528050900648983e-08, "kl": 0.02471923828125, "learning_rate": 1.9774652502902666e-05, "loss": 0.001, "num_tokens": 351678769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16130408807715285, "frac_reward_zero_std": 1.0, "grad_norm": 2.8620176083775755e-08, "kl": 0.024261474609375, "learning_rate": 1.977339308019964e-05, "loss": 0.001, "num_tokens": 352243617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16147478023384826, "frac_reward_zero_std": 1.0, "grad_norm": 2.2360419763697797e-08, "kl": 0.024658203125, "learning_rate": 1.977213018829487e-05, "loss": 0.001, "num_tokens": 352811825.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16164547239054367, "frac_reward_zero_std": 1.0, "grad_norm": 1.9240419158578246e-08, "kl": 0.026214599609375, "learning_rate": 1.9770863827636634e-05, "loss": 0.001, "num_tokens": 353372161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16181616454723904, "frac_reward_zero_std": 1.0, "grad_norm": 1.6093814007570927e-07, "kl": 0.025543212890625, "learning_rate": 1.9769593998674453e-05, "loss": 0.001, "num_tokens": 353934737.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16198685670393445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012368571960891112, "kl": 0.025360107421875, "learning_rate": 1.9768320701859062e-05, "loss": 0.001, "num_tokens": 354497329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16215754886062986, "frac_reward_zero_std": 1.0, "grad_norm": 1534662478.9956062, "kl": 15990784.012268066, "learning_rate": 1.9767043937642437e-05, "loss": 641152.25, "num_tokens": 355065537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 458.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 85.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 1.0, "completions/min_terminated_length": 0.0, "epoch": 0.16232824101732526, "frac_reward_zero_std": 1.0, "grad_norm": 2.973603674432105, "kl": 17.75, "learning_rate": 1.9765763706477782e-05, "loss": 0.7099, "num_tokens": 355134097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 61.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 36.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 27.0, "completions/min_terminated_length": 0.0, "epoch": 0.16249893317402064, "frac_reward_zero_std": 1.0, "grad_norm": 5.828513012664599, "kl": 15.796875, "learning_rate": 1.9764480008819536e-05, "loss": 0.6326, "num_tokens": 355179761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2026.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1872.0, "completions/min_terminated_length": 0.0, "epoch": 0.16266962533071605, "frac_reward_zero_std": 1.0, "grad_norm": 1.8131180852927862, "kl": 2.9765625, "learning_rate": 1.9763192845123368e-05, "loss": 0.1193, "num_tokens": 355739345.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1097.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 772.0, "completions/min_terminated_length": 0.0, "epoch": 0.16284031748741146, "frac_reward_zero_std": 1.0, "grad_norm": 41.28194239611481, "kl": 8.8359375, "learning_rate": 1.976190221584617e-05, "loss": 0.3537, "num_tokens": 356062417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1755.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1468.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 1012.0, "completions/min_terminated_length": 0.0, "epoch": 0.16301100964410686, "frac_reward_zero_std": 1.0, "grad_norm": 40.57900008636055, "kl": 9.1875, "learning_rate": 1.9760608121446066e-05, "loss": 0.3683, "num_tokens": 356481633.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1568.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 897.0, "completions/min_terminated_length": 0.0, "epoch": 0.16318170180080224, "frac_reward_zero_std": 1.0, "grad_norm": 8.839319531487371, "kl": 3.67578125, "learning_rate": 1.9759310562382423e-05, "loss": 0.1471, "num_tokens": 356922689.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16335239395749765, "frac_reward_zero_std": 1.0, "grad_norm": 0.949330111369109, "kl": 1.373046875, "learning_rate": 1.9758009539115814e-05, "loss": 0.0549, "num_tokens": 357487249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16352308611419306, "frac_reward_zero_std": 1.0, "grad_norm": 0.3335542780104391, "kl": 0.1197509765625, "learning_rate": 1.9756705052108068e-05, "loss": 0.0048, "num_tokens": 358050353.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16369377827088846, "frac_reward_zero_std": 1.0, "grad_norm": 0.016518873850849387, "kl": 0.03070068359375, "learning_rate": 1.9755397101822223e-05, "loss": 0.0012, "num_tokens": 358620465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16386447042758384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010852471783049029, "kl": 0.01812744140625, "learning_rate": 1.9754085688722554e-05, "loss": 0.0007, "num_tokens": 359182481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16403516258427925, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011690424226734515, "kl": 0.01788330078125, "learning_rate": 1.9752770813274574e-05, "loss": 0.0007, "num_tokens": 359747281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16420585474097465, "frac_reward_zero_std": 1.0, "grad_norm": 0.06773190665030802, "kl": 0.02203369140625, "learning_rate": 1.9751452475945005e-05, "loss": 0.0009, "num_tokens": 360311121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16437654689767006, "frac_reward_zero_std": 1.0, "grad_norm": 1.5791478115415517e-08, "kl": 0.017669677734375, "learning_rate": 1.9750130677201816e-05, "loss": 0.0007, "num_tokens": 360882465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16454723905436544, "frac_reward_zero_std": 1.0, "grad_norm": 3.4351509028255397e-09, "kl": 0.017364501953125, "learning_rate": 1.9748805417514195e-05, "loss": 0.0007, "num_tokens": 361449297.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16471793121106085, "frac_reward_zero_std": 1.0, "grad_norm": 3.0956881651033733e-09, "kl": 0.017242431640625, "learning_rate": 1.9747476697352567e-05, "loss": 0.0007, "num_tokens": 362011425.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16488862336775625, "frac_reward_zero_std": 1.0, "grad_norm": 1.6457306222060511e-09, "kl": 0.01776123046875, "learning_rate": 1.974614451718857e-05, "loss": 0.0007, "num_tokens": 362576001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16505931552445166, "frac_reward_zero_std": 1.0, "grad_norm": 1.0185993316053844e-09, "kl": 0.017974853515625, "learning_rate": 1.9744808877495084e-05, "loss": 0.0007, "num_tokens": 363137153.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16523000768114704, "frac_reward_zero_std": 1.0, "grad_norm": 6.709542418476447e-10, "kl": 0.017486572265625, "learning_rate": 1.9743469778746222e-05, "loss": 0.0007, "num_tokens": 363710417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16540069983784245, "frac_reward_zero_std": 1.0, "grad_norm": 4.829159341776059e-10, "kl": 0.01800537109375, "learning_rate": 1.9742127221417298e-05, "loss": 0.0007, "num_tokens": 364282033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16557139199453785, "frac_reward_zero_std": 1.0, "grad_norm": 3.364016093908487e-10, "kl": 0.01641845703125, "learning_rate": 1.9740781205984888e-05, "loss": 0.0007, "num_tokens": 364855825.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16574208415123326, "frac_reward_zero_std": 1.0, "grad_norm": 2.706593243182733e-10, "kl": 0.017608642578125, "learning_rate": 1.9739431732926765e-05, "loss": 0.0007, "num_tokens": 365422657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16591277630792864, "frac_reward_zero_std": 1.0, "grad_norm": 6.595425967287684e-10, "kl": 0.01763916015625, "learning_rate": 1.9738078802721957e-05, "loss": 0.0007, "num_tokens": 365991361.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16608346846462405, "frac_reward_zero_std": 1.0, "grad_norm": 1.1635139931817252e-10, "kl": 0.017242431640625, "learning_rate": 1.9736722415850694e-05, "loss": 0.0007, "num_tokens": 366558513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16625416062131945, "frac_reward_zero_std": 1.0, "grad_norm": 2.0181989408511237e-10, "kl": 0.017242431640625, "learning_rate": 1.9735362572794454e-05, "loss": 0.0007, "num_tokens": 367122289.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16642485277801486, "frac_reward_zero_std": 1.0, "grad_norm": 3.1199191388972274e-10, "kl": 0.017913818359375, "learning_rate": 1.973399927403592e-05, "loss": 0.0007, "num_tokens": 367687921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16659554493471024, "frac_reward_zero_std": 1.0, "grad_norm": 2.6172995904035764e-10, "kl": 0.01739501953125, "learning_rate": 1.9732632520059025e-05, "loss": 0.0007, "num_tokens": 368250865.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16676623709140564, "frac_reward_zero_std": 1.0, "grad_norm": 4.4823552300353865e-10, "kl": 0.016998291015625, "learning_rate": 1.9731262311348913e-05, "loss": 0.0007, "num_tokens": 368822065.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16693692924810105, "frac_reward_zero_std": 1.0, "grad_norm": 4.0334906935428974e-10, "kl": 0.0169677734375, "learning_rate": 1.972988864839196e-05, "loss": 0.0007, "num_tokens": 369390129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16710762140479646, "frac_reward_zero_std": 1.0, "grad_norm": 7.480870022694558e-10, "kl": 0.017120361328125, "learning_rate": 1.9728511531675763e-05, "loss": 0.0007, "num_tokens": 369957313.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16727831356149184, "frac_reward_zero_std": 1.0, "grad_norm": 9.50629044244028e-10, "kl": 0.017608642578125, "learning_rate": 1.9727130961689155e-05, "loss": 0.0007, "num_tokens": 370522657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16744900571818724, "frac_reward_zero_std": 1.0, "grad_norm": 0.00038539661940277, "kl": 0.01690673828125, "learning_rate": 1.972574693892218e-05, "loss": 0.0007, "num_tokens": 371088673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16761969787488265, "frac_reward_zero_std": 1.0, "grad_norm": 7.322753602324859e-07, "kl": 0.017364501953125, "learning_rate": 1.9724359463866123e-05, "loss": 0.0007, "num_tokens": 371651057.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16779039003157806, "frac_reward_zero_std": 1.0, "grad_norm": 2.0822239218219476e-09, "kl": 0.017425537109375, "learning_rate": 1.9722968537013484e-05, "loss": 0.0007, "num_tokens": 372214657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16796108218827344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003310112372084264, "kl": 0.017059326171875, "learning_rate": 1.9721574158858e-05, "loss": 0.0007, "num_tokens": 372780193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16813177434496884, "frac_reward_zero_std": 1.0, "grad_norm": 9.070562551945146e-10, "kl": 0.01776123046875, "learning_rate": 1.9720176329894612e-05, "loss": 0.0007, "num_tokens": 373343937.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16830246650166425, "frac_reward_zero_std": 1.0, "grad_norm": 1.4213337584941892e-06, "kl": 0.01788330078125, "learning_rate": 1.971877505061951e-05, "loss": 0.0007, "num_tokens": 373909393.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16847315865835966, "frac_reward_zero_std": 1.0, "grad_norm": 0.001440106827797823, "kl": 0.017669677734375, "learning_rate": 1.9717370321530085e-05, "loss": 0.0007, "num_tokens": 374472497.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16864385081505504, "frac_reward_zero_std": 1.0, "grad_norm": 0.08546521174565612, "kl": 0.017486572265625, "learning_rate": 1.9715962143124975e-05, "loss": 0.0007, "num_tokens": 375032273.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16881454297175044, "frac_reward_zero_std": 1.0, "grad_norm": 5.011056660903193e-11, "kl": 0.017608642578125, "learning_rate": 1.9714550515904033e-05, "loss": 0.0007, "num_tokens": 375592865.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16898523512844585, "frac_reward_zero_std": 1.0, "grad_norm": 3.888824924037522e-11, "kl": 0.0172119140625, "learning_rate": 1.9713135440368335e-05, "loss": 0.0007, "num_tokens": 376167601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16915592728514126, "frac_reward_zero_std": 1.0, "grad_norm": 1.203324008533106e-10, "kl": 0.017333984375, "learning_rate": 1.9711716917020176e-05, "loss": 0.0007, "num_tokens": 376732145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16932661944183663, "frac_reward_zero_std": 1.0, "grad_norm": 4.516734611510574e-11, "kl": 0.01788330078125, "learning_rate": 1.9710294946363085e-05, "loss": 0.0007, "num_tokens": 377292625.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16949731159853204, "frac_reward_zero_std": 1.0, "grad_norm": 4.482427766872266e-11, "kl": 0.017120361328125, "learning_rate": 1.970886952890181e-05, "loss": 0.0007, "num_tokens": 377856385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16966800375522745, "frac_reward_zero_std": 1.0, "grad_norm": 4.50235455850579e-11, "kl": 0.01800537109375, "learning_rate": 1.9707440665142322e-05, "loss": 0.0007, "num_tokens": 378423537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.16983869591192285, "frac_reward_zero_std": 1.0, "grad_norm": 4.1915290232675655e-11, "kl": 0.01788330078125, "learning_rate": 1.9706008355591817e-05, "loss": 0.0007, "num_tokens": 378993521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17000938806861823, "frac_reward_zero_std": 1.0, "grad_norm": 4.255891210193837e-11, "kl": 0.01708984375, "learning_rate": 1.9704572600758712e-05, "loss": 0.0007, "num_tokens": 379558289.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17018008022531364, "frac_reward_zero_std": 1.0, "grad_norm": 4.0384485632448614e-11, "kl": 0.017120361328125, "learning_rate": 1.970313340115265e-05, "loss": 0.0007, "num_tokens": 380124865.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17035077238200905, "frac_reward_zero_std": 1.0, "grad_norm": 4.051623900738144e-11, "kl": 0.018035888671875, "learning_rate": 1.970169075728449e-05, "loss": 0.0007, "num_tokens": 380688385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17052146453870445, "frac_reward_zero_std": 1.0, "grad_norm": 3.7556189344890935e-11, "kl": 0.017181396484375, "learning_rate": 1.9700244669666327e-05, "loss": 0.0007, "num_tokens": 381250577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17069215669539983, "frac_reward_zero_std": 1.0, "grad_norm": 3.785736793871436e-11, "kl": 0.017852783203125, "learning_rate": 1.9698795138811463e-05, "loss": 0.0007, "num_tokens": 381817185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17086284885209524, "frac_reward_zero_std": 1.0, "grad_norm": 4.038085111988934e-11, "kl": 0.01739501953125, "learning_rate": 1.969734216523443e-05, "loss": 0.0007, "num_tokens": 382382817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17103354100879065, "frac_reward_zero_std": 1.0, "grad_norm": 3.830049653784223e-11, "kl": 0.01678466796875, "learning_rate": 1.9695885749450982e-05, "loss": 0.0007, "num_tokens": 382951265.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17120423316548605, "frac_reward_zero_std": 1.0, "grad_norm": 3.9227023151134063e-11, "kl": 0.0169677734375, "learning_rate": 1.96944258919781e-05, "loss": 0.0007, "num_tokens": 383511281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17137492532218146, "frac_reward_zero_std": 1.0, "grad_norm": 3.770954479716235e-11, "kl": 0.016998291015625, "learning_rate": 1.9692962593333968e-05, "loss": 0.0007, "num_tokens": 384079025.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17154561747887684, "frac_reward_zero_std": 1.0, "grad_norm": 4.128397555966063e-11, "kl": 0.01800537109375, "learning_rate": 1.969149585403801e-05, "loss": 0.0007, "num_tokens": 384646625.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17171630963557225, "frac_reward_zero_std": 1.0, "grad_norm": 3.9416535360012236e-11, "kl": 0.017120361328125, "learning_rate": 1.9690025674610874e-05, "loss": 0.0007, "num_tokens": 385216801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17188700179226765, "frac_reward_zero_std": 1.0, "grad_norm": 4.181372835008846e-11, "kl": 0.017425537109375, "learning_rate": 1.968855205557441e-05, "loss": 0.0007, "num_tokens": 385787041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17205769394896306, "frac_reward_zero_std": 1.0, "grad_norm": 4.4244617759401805e-11, "kl": 0.01763916015625, "learning_rate": 1.96870749974517e-05, "loss": 0.0007, "num_tokens": 386349393.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17222838610565844, "frac_reward_zero_std": 1.0, "grad_norm": 4.252944064355211e-11, "kl": 0.017608642578125, "learning_rate": 1.9685594500767054e-05, "loss": 0.0007, "num_tokens": 386916161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17239907826235384, "frac_reward_zero_std": 1.0, "grad_norm": 4.004627820387125e-11, "kl": 0.017303466796875, "learning_rate": 1.9684110566045987e-05, "loss": 0.0007, "num_tokens": 387477729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17256977041904925, "frac_reward_zero_std": 1.0, "grad_norm": 4.231550918731662e-11, "kl": 0.01715087890625, "learning_rate": 1.9682623193815243e-05, "loss": 0.0007, "num_tokens": 388044977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17274046257574466, "frac_reward_zero_std": 1.0, "grad_norm": 4.4349681044681826e-11, "kl": 0.01800537109375, "learning_rate": 1.9681132384602795e-05, "loss": 0.0007, "num_tokens": 388610561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17291115473244004, "frac_reward_zero_std": 1.0, "grad_norm": 4.40712454928661e-11, "kl": 0.017425537109375, "learning_rate": 1.9679638138937813e-05, "loss": 0.0007, "num_tokens": 389180865.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17308184688913544, "frac_reward_zero_std": 1.0, "grad_norm": 4.162871593887704e-11, "kl": 0.0174560546875, "learning_rate": 1.967814045735071e-05, "loss": 0.0007, "num_tokens": 389745201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17325253904583085, "frac_reward_zero_std": 1.0, "grad_norm": 4.2460879505335375e-11, "kl": 0.0172119140625, "learning_rate": 1.9676639340373103e-05, "loss": 0.0007, "num_tokens": 390310401.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17342323120252626, "frac_reward_zero_std": 1.0, "grad_norm": 3.9843065244815075e-11, "kl": 0.01666259765625, "learning_rate": 1.9675134788537838e-05, "loss": 0.0007, "num_tokens": 390881297.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17359392335922164, "frac_reward_zero_std": 1.0, "grad_norm": 3.92075780703876e-11, "kl": 0.016845703125, "learning_rate": 1.9673626802378978e-05, "loss": 0.0007, "num_tokens": 391451569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17376461551591704, "frac_reward_zero_std": 1.0, "grad_norm": 4.1564743048588816e-11, "kl": 0.017364501953125, "learning_rate": 1.9672115382431796e-05, "loss": 0.0007, "num_tokens": 392023393.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17393530767261245, "frac_reward_zero_std": 1.0, "grad_norm": 4.0357551031779935e-11, "kl": 0.01702880859375, "learning_rate": 1.96706005292328e-05, "loss": 0.0007, "num_tokens": 392589745.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17410599982930786, "frac_reward_zero_std": 1.0, "grad_norm": 3.9796753758440075e-11, "kl": 0.016876220703125, "learning_rate": 1.9669082243319703e-05, "loss": 0.0007, "num_tokens": 393151121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17427669198600323, "frac_reward_zero_std": 1.0, "grad_norm": 4.170131087151616e-11, "kl": 0.017791748046875, "learning_rate": 1.9667560525231444e-05, "loss": 0.0007, "num_tokens": 393716721.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17444738414269864, "frac_reward_zero_std": 1.0, "grad_norm": 4.030773457075833e-11, "kl": 0.01678466796875, "learning_rate": 1.9666035375508178e-05, "loss": 0.0007, "num_tokens": 394284625.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17461807629939405, "frac_reward_zero_std": 1.0, "grad_norm": 4.486356334514672e-11, "kl": 0.01690673828125, "learning_rate": 1.966450679469128e-05, "loss": 0.0007, "num_tokens": 394847361.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17478876845608946, "frac_reward_zero_std": 1.0, "grad_norm": 3.796859627471812e-11, "kl": 0.016693115234375, "learning_rate": 1.9662974783323334e-05, "loss": 0.0007, "num_tokens": 395414673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17495946061278483, "frac_reward_zero_std": 1.0, "grad_norm": 4.405734436580778e-11, "kl": 0.01812744140625, "learning_rate": 1.9661439341948154e-05, "loss": 0.0007, "num_tokens": 395980673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17513015276948024, "frac_reward_zero_std": 1.0, "grad_norm": 4.148677631102116e-11, "kl": 0.0167236328125, "learning_rate": 1.9659900471110765e-05, "loss": 0.0007, "num_tokens": 396553217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17530084492617565, "frac_reward_zero_std": 1.0, "grad_norm": 4.528184293733827e-11, "kl": 0.01776123046875, "learning_rate": 1.9658358171357418e-05, "loss": 0.0007, "num_tokens": 397120593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17547153708287105, "frac_reward_zero_std": 1.0, "grad_norm": 4.231830300572789e-11, "kl": 0.0174560546875, "learning_rate": 1.9656812443235557e-05, "loss": 0.0007, "num_tokens": 397686417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17564222923956643, "frac_reward_zero_std": 1.0, "grad_norm": 4.3499571331137e-11, "kl": 0.017364501953125, "learning_rate": 1.9655263287293874e-05, "loss": 0.0007, "num_tokens": 398263041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17581292139626184, "frac_reward_zero_std": 1.0, "grad_norm": 4.1238298792603045e-11, "kl": 0.017333984375, "learning_rate": 1.965371070408226e-05, "loss": 0.0007, "num_tokens": 398824433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17598361355295725, "frac_reward_zero_std": 1.0, "grad_norm": 4.02623408190443e-11, "kl": 0.0166015625, "learning_rate": 1.9652154694151827e-05, "loss": 0.0007, "num_tokens": 399392273.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17615430570965265, "frac_reward_zero_std": 1.0, "grad_norm": 4.081751558221264e-11, "kl": 0.017608642578125, "learning_rate": 1.9650595258054897e-05, "loss": 0.0007, "num_tokens": 399962417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17632499786634803, "frac_reward_zero_std": 1.0, "grad_norm": 4.513234828370064e-11, "kl": 0.017669677734375, "learning_rate": 1.9649032396345017e-05, "loss": 0.0007, "num_tokens": 400531601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17649569002304344, "frac_reward_zero_std": 1.0, "grad_norm": 4.239728942171932e-11, "kl": 0.017425537109375, "learning_rate": 1.9647466109576947e-05, "loss": 0.0007, "num_tokens": 401093921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17666638217973885, "frac_reward_zero_std": 1.0, "grad_norm": 4.2183952916382655e-11, "kl": 0.017059326171875, "learning_rate": 1.9645896398306662e-05, "loss": 0.0007, "num_tokens": 401668961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17683707433643425, "frac_reward_zero_std": 1.0, "grad_norm": 4.0876892807516605e-11, "kl": 0.017669677734375, "learning_rate": 1.9644323263091348e-05, "loss": 0.0007, "num_tokens": 402234833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17700776649312963, "frac_reward_zero_std": 1.0, "grad_norm": 4.08620952908552e-11, "kl": 0.017303466796875, "learning_rate": 1.964274670448942e-05, "loss": 0.0007, "num_tokens": 402805665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17717845864982504, "frac_reward_zero_std": 1.0, "grad_norm": 3.9344578893660843e-11, "kl": 0.01739501953125, "learning_rate": 1.9641166723060495e-05, "loss": 0.0007, "num_tokens": 403370977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17734915080652044, "frac_reward_zero_std": 1.0, "grad_norm": 3.8884361815252936e-11, "kl": 0.016632080078125, "learning_rate": 1.963958331936541e-05, "loss": 0.0007, "num_tokens": 403939249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17751984296321585, "frac_reward_zero_std": 1.0, "grad_norm": 4.132536839533854e-11, "kl": 0.018341064453125, "learning_rate": 1.9637996493966213e-05, "loss": 0.0007, "num_tokens": 404504529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17769053511991123, "frac_reward_zero_std": 1.0, "grad_norm": 4.1632210662535487e-11, "kl": 0.017913818359375, "learning_rate": 1.9636406247426178e-05, "loss": 0.0007, "num_tokens": 405078001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17786122727660664, "frac_reward_zero_std": 1.0, "grad_norm": 4.0703459178802546e-11, "kl": 0.017242431640625, "learning_rate": 1.9634812580309773e-05, "loss": 0.0007, "num_tokens": 405641937.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17803191943330204, "frac_reward_zero_std": 1.0, "grad_norm": 4.2791182525842986e-11, "kl": 0.017730712890625, "learning_rate": 1.9633215493182702e-05, "loss": 0.0007, "num_tokens": 406210625.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17820261158999745, "frac_reward_zero_std": 1.0, "grad_norm": 4.196981373079072e-11, "kl": 0.017547607421875, "learning_rate": 1.963161498661187e-05, "loss": 0.0007, "num_tokens": 406773585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17837330374669283, "frac_reward_zero_std": 1.0, "grad_norm": 4.336100562942804e-11, "kl": 0.01708984375, "learning_rate": 1.9630011061165398e-05, "loss": 0.0007, "num_tokens": 407339377.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17854399590338824, "frac_reward_zero_std": 1.0, "grad_norm": 4.183507942717987e-11, "kl": 0.01739501953125, "learning_rate": 1.9628403717412622e-05, "loss": 0.0007, "num_tokens": 407901681.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17871468806008364, "frac_reward_zero_std": 1.0, "grad_norm": 4.391430105478966e-11, "kl": 0.017364501953125, "learning_rate": 1.9626792955924096e-05, "loss": 0.0007, "num_tokens": 408469809.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17888538021677905, "frac_reward_zero_std": 1.0, "grad_norm": 4.1928662941547296e-11, "kl": 0.01702880859375, "learning_rate": 1.962517877727157e-05, "loss": 0.0007, "num_tokens": 409030481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17905607237347443, "frac_reward_zero_std": 1.0, "grad_norm": 4.104436365180889e-11, "kl": 0.016815185546875, "learning_rate": 1.9623561182028034e-05, "loss": 0.0007, "num_tokens": 409595505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17922676453016984, "frac_reward_zero_std": 1.0, "grad_norm": 4.079865994893407e-11, "kl": 0.01727294921875, "learning_rate": 1.9621940170767665e-05, "loss": 0.0007, "num_tokens": 410165809.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17939745668686524, "frac_reward_zero_std": 1.0, "grad_norm": 4.22401630589307e-11, "kl": 0.01776123046875, "learning_rate": 1.962031574406587e-05, "loss": 0.0007, "num_tokens": 410732977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17956814884356065, "frac_reward_zero_std": 1.0, "grad_norm": 4.194239340516068e-11, "kl": 0.017730712890625, "learning_rate": 1.9618687902499257e-05, "loss": 0.0007, "num_tokens": 411298737.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17973884100025603, "frac_reward_zero_std": 1.0, "grad_norm": 4.3035239779653515e-11, "kl": 0.01702880859375, "learning_rate": 1.9617056646645655e-05, "loss": 0.0007, "num_tokens": 411879249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.17990953315695143, "frac_reward_zero_std": 1.0, "grad_norm": 4.386993800231483e-11, "kl": 0.017120361328125, "learning_rate": 1.96154219770841e-05, "loss": 0.0007, "num_tokens": 412443121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18008022531364684, "frac_reward_zero_std": 1.0, "grad_norm": 4.1565830165569294e-11, "kl": 0.01739501953125, "learning_rate": 1.961378389439484e-05, "loss": 0.0007, "num_tokens": 413005761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18025091747034225, "frac_reward_zero_std": 1.0, "grad_norm": 4.2185134914607756e-11, "kl": 0.017303466796875, "learning_rate": 1.9612142399159336e-05, "loss": 0.0007, "num_tokens": 413568705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18042160962703763, "frac_reward_zero_std": 1.0, "grad_norm": 4.074503109473594e-11, "kl": 0.01812744140625, "learning_rate": 1.9610497491960256e-05, "loss": 0.0007, "num_tokens": 414133681.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18059230178373303, "frac_reward_zero_std": 1.0, "grad_norm": 4.360508511226729e-11, "kl": 0.01763916015625, "learning_rate": 1.9608849173381483e-05, "loss": 0.0007, "num_tokens": 414695377.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18076299394042844, "frac_reward_zero_std": 1.0, "grad_norm": 4.3729535410819274e-11, "kl": 0.01690673828125, "learning_rate": 1.9607197444008118e-05, "loss": 0.0007, "num_tokens": 415263201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18093368609712385, "frac_reward_zero_std": 1.0, "grad_norm": 4.344656918468638e-11, "kl": 0.01806640625, "learning_rate": 1.9605542304426456e-05, "loss": 0.0007, "num_tokens": 415828689.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18110437825381923, "frac_reward_zero_std": 1.0, "grad_norm": 4.186942228886349e-11, "kl": 0.01739501953125, "learning_rate": 1.9603883755224023e-05, "loss": 0.0007, "num_tokens": 416397185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18127507041051463, "frac_reward_zero_std": 1.0, "grad_norm": 4.346042016236681e-11, "kl": 0.01812744140625, "learning_rate": 1.960222179698953e-05, "loss": 0.0007, "num_tokens": 416963473.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18144576256721004, "frac_reward_zero_std": 1.0, "grad_norm": 4.2539255905800154e-11, "kl": 0.017547607421875, "learning_rate": 1.9600556430312923e-05, "loss": 0.0007, "num_tokens": 417533905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18161645472390545, "frac_reward_zero_std": 1.0, "grad_norm": 4.1448136452253077e-11, "kl": 0.017242431640625, "learning_rate": 1.9598887655785343e-05, "loss": 0.0007, "num_tokens": 418101681.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18178714688060083, "frac_reward_zero_std": 1.0, "grad_norm": 4.189413151152433e-11, "kl": 0.01739501953125, "learning_rate": 1.9597215473999146e-05, "loss": 0.0007, "num_tokens": 418666721.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18195783903729623, "frac_reward_zero_std": 1.0, "grad_norm": 4.3808593448900137e-11, "kl": 0.0174560546875, "learning_rate": 1.9595539885547894e-05, "loss": 0.0007, "num_tokens": 419230801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18212853119399164, "frac_reward_zero_std": 1.0, "grad_norm": 4.2350706135982626e-11, "kl": 0.017822265625, "learning_rate": 1.959386089102636e-05, "loss": 0.0007, "num_tokens": 419795105.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18229922335068705, "frac_reward_zero_std": 1.0, "grad_norm": 4.090748782880054e-11, "kl": 0.017547607421875, "learning_rate": 1.959217849103053e-05, "loss": 0.0007, "num_tokens": 420354497.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18246991550738242, "frac_reward_zero_std": 1.0, "grad_norm": 4.167604080614939e-11, "kl": 0.017059326171875, "learning_rate": 1.9590492686157595e-05, "loss": 0.0007, "num_tokens": 420919473.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18264060766407783, "frac_reward_zero_std": 1.0, "grad_norm": 4.1708056366334466e-11, "kl": 0.0177001953125, "learning_rate": 1.958880347700595e-05, "loss": 0.0007, "num_tokens": 421483329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18281129982077324, "frac_reward_zero_std": 1.0, "grad_norm": 4.154690149375299e-11, "kl": 0.01751708984375, "learning_rate": 1.958711086417521e-05, "loss": 0.0007, "num_tokens": 422045505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18298199197746864, "frac_reward_zero_std": 1.0, "grad_norm": 4.171784213035609e-11, "kl": 0.017120361328125, "learning_rate": 1.9585414848266185e-05, "loss": 0.0007, "num_tokens": 422631425.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18315268413416402, "frac_reward_zero_std": 1.0, "grad_norm": 4.244625804252547e-11, "kl": 0.017547607421875, "learning_rate": 1.9583715429880904e-05, "loss": 0.0007, "num_tokens": 423190865.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18332337629085943, "frac_reward_zero_std": 1.0, "grad_norm": 3.993271899315906e-11, "kl": 0.0177001953125, "learning_rate": 1.9582012609622598e-05, "loss": 0.0007, "num_tokens": 423757457.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18349406844755484, "frac_reward_zero_std": 1.0, "grad_norm": 4.1438492755302756e-11, "kl": 0.01788330078125, "learning_rate": 1.9580306388095707e-05, "loss": 0.0007, "num_tokens": 424321713.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18366476060425024, "frac_reward_zero_std": 1.0, "grad_norm": 4.3340164060399837e-11, "kl": 0.017364501953125, "learning_rate": 1.9578596765905875e-05, "loss": 0.0007, "num_tokens": 424881713.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18383545276094562, "frac_reward_zero_std": 1.0, "grad_norm": 4.256665687237079e-11, "kl": 0.0167236328125, "learning_rate": 1.9576883743659958e-05, "loss": 0.0007, "num_tokens": 425446705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18400614491764103, "frac_reward_zero_std": 1.0, "grad_norm": 4.185669869293606e-11, "kl": 0.017303466796875, "learning_rate": 1.957516732196602e-05, "loss": 0.0007, "num_tokens": 426012097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18417683707433644, "frac_reward_zero_std": 1.0, "grad_norm": 4.158059302600143e-11, "kl": 0.0174560546875, "learning_rate": 1.9573447501433327e-05, "loss": 0.0007, "num_tokens": 426573665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18434752923103184, "frac_reward_zero_std": 1.0, "grad_norm": 4.2377209383579765e-11, "kl": 0.017303466796875, "learning_rate": 1.957172428267235e-05, "loss": 0.0007, "num_tokens": 427139281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18451822138772722, "frac_reward_zero_std": 1.0, "grad_norm": 4.054449907396399e-11, "kl": 0.017242431640625, "learning_rate": 1.9569997666294774e-05, "loss": 0.0007, "num_tokens": 427708945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18468891354442263, "frac_reward_zero_std": 1.0, "grad_norm": 4.297692322520249e-11, "kl": 0.017578125, "learning_rate": 1.9568267652913484e-05, "loss": 0.0007, "num_tokens": 428273953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18485960570111803, "frac_reward_zero_std": 1.0, "grad_norm": 4.1408093808617226e-11, "kl": 0.017730712890625, "learning_rate": 1.956653424314257e-05, "loss": 0.0007, "num_tokens": 428841041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18503029785781344, "frac_reward_zero_std": 1.0, "grad_norm": 3.7850371998760714e-11, "kl": 0.017059326171875, "learning_rate": 1.9564797437597334e-05, "loss": 0.0007, "num_tokens": 429411233.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18520099001450882, "frac_reward_zero_std": 1.0, "grad_norm": 4.3444560641042766e-11, "kl": 0.017364501953125, "learning_rate": 1.9563057236894276e-05, "loss": 0.0007, "num_tokens": 429975281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18537168217120423, "frac_reward_zero_std": 1.0, "grad_norm": 4.001479795178617e-11, "kl": 0.017059326171875, "learning_rate": 1.9561313641651104e-05, "loss": 0.0007, "num_tokens": 430547009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18554237432789963, "frac_reward_zero_std": 1.0, "grad_norm": 4.355250902432754e-11, "kl": 0.01788330078125, "learning_rate": 1.9559566652486734e-05, "loss": 0.0007, "num_tokens": 431108929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18571306648459504, "frac_reward_zero_std": 1.0, "grad_norm": 4.268659264494527e-11, "kl": 0.018524169921875, "learning_rate": 1.9557816270021284e-05, "loss": 0.0007, "num_tokens": 431673601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18588375864129042, "frac_reward_zero_std": 1.0, "grad_norm": 4.0045522562492155e-11, "kl": 0.017486572265625, "learning_rate": 1.9556062494876072e-05, "loss": 0.0007, "num_tokens": 432243729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18605445079798583, "frac_reward_zero_std": 1.0, "grad_norm": 3.871863499809122e-11, "kl": 0.016815185546875, "learning_rate": 1.9554305327673632e-05, "loss": 0.0007, "num_tokens": 432810241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18622514295468123, "frac_reward_zero_std": 1.0, "grad_norm": 4.3701782685432626e-11, "kl": 0.017852783203125, "learning_rate": 1.9552544769037693e-05, "loss": 0.0007, "num_tokens": 433371617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18639583511137664, "frac_reward_zero_std": 1.0, "grad_norm": 4.134104106647065e-11, "kl": 0.01751708984375, "learning_rate": 1.9550780819593184e-05, "loss": 0.0007, "num_tokens": 433942561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18656652726807202, "frac_reward_zero_std": 1.0, "grad_norm": 4.167085708944971e-11, "kl": 0.017608642578125, "learning_rate": 1.9549013479966247e-05, "loss": 0.0007, "num_tokens": 434503953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18673721942476743, "frac_reward_zero_std": 1.0, "grad_norm": 3.8586940795277636e-11, "kl": 0.017242431640625, "learning_rate": 1.9547242750784224e-05, "loss": 0.0007, "num_tokens": 435074065.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18690791158146283, "frac_reward_zero_std": 1.0, "grad_norm": 4.274724386286086e-11, "kl": 0.01715087890625, "learning_rate": 1.9545468632675665e-05, "loss": 0.0007, "num_tokens": 435646449.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18707860373815824, "frac_reward_zero_std": 1.0, "grad_norm": 4.0305893127797106e-11, "kl": 0.01708984375, "learning_rate": 1.9543691126270308e-05, "loss": 0.0007, "num_tokens": 436213697.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18724929589485362, "frac_reward_zero_std": 1.0, "grad_norm": 4.331765516945493e-11, "kl": 0.0186767578125, "learning_rate": 1.954191023219911e-05, "loss": 0.0007, "num_tokens": 436774657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18741998805154902, "frac_reward_zero_std": 1.0, "grad_norm": 4.2140375442800625e-11, "kl": 0.0174560546875, "learning_rate": 1.9540125951094226e-05, "loss": 0.0007, "num_tokens": 437336609.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18759068020824443, "frac_reward_zero_std": 1.0, "grad_norm": 4.1409009420596964e-11, "kl": 0.017181396484375, "learning_rate": 1.9538338283589005e-05, "loss": 0.0007, "num_tokens": 437901585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18776137236493984, "frac_reward_zero_std": 1.0, "grad_norm": 4.315382311386127e-11, "kl": 0.017608642578125, "learning_rate": 1.9536547230318006e-05, "loss": 0.0007, "num_tokens": 438465313.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18793206452163522, "frac_reward_zero_std": 1.0, "grad_norm": 3.867097557100216e-11, "kl": 0.017059326171875, "learning_rate": 1.9534752791916995e-05, "loss": 0.0007, "num_tokens": 439033633.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18810275667833062, "frac_reward_zero_std": 1.0, "grad_norm": 4.1754849113525756e-11, "kl": 0.01678466796875, "learning_rate": 1.9532954969022922e-05, "loss": 0.0007, "num_tokens": 439600257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18827344883502603, "frac_reward_zero_std": 1.0, "grad_norm": 3.849573687244438e-11, "kl": 0.01708984375, "learning_rate": 1.9531153762273957e-05, "loss": 0.0007, "num_tokens": 440166417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18844414099172144, "frac_reward_zero_std": 1.0, "grad_norm": 4.27847423513582e-11, "kl": 0.017364501953125, "learning_rate": 1.9529349172309463e-05, "loss": 0.0007, "num_tokens": 440724145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18861483314841684, "frac_reward_zero_std": 1.0, "grad_norm": 4.2880058019483786e-11, "kl": 0.017333984375, "learning_rate": 1.952754119977e-05, "loss": 0.0007, "num_tokens": 441290705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18878552530511222, "frac_reward_zero_std": 1.0, "grad_norm": 4.4146252088808e-11, "kl": 0.017852783203125, "learning_rate": 1.9525729845297337e-05, "loss": 0.0007, "num_tokens": 441854449.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18895621746180763, "frac_reward_zero_std": 1.0, "grad_norm": 3.9022148105953316e-11, "kl": 0.017578125, "learning_rate": 1.9523915109534437e-05, "loss": 0.0007, "num_tokens": 442423009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18912690961850304, "frac_reward_zero_std": 1.0, "grad_norm": 4.232026797324883e-11, "kl": 0.017974853515625, "learning_rate": 1.952209699312547e-05, "loss": 0.0007, "num_tokens": 442988081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18929760177519844, "frac_reward_zero_std": 1.0, "grad_norm": 4.139295879069044e-11, "kl": 0.01708984375, "learning_rate": 1.9520275496715796e-05, "loss": 0.0007, "num_tokens": 443553105.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18946829393189382, "frac_reward_zero_std": 1.0, "grad_norm": 4.159010950936529e-11, "kl": 0.018096923828125, "learning_rate": 1.9518450620951982e-05, "loss": 0.0007, "num_tokens": 444125185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18963898608858923, "frac_reward_zero_std": 1.0, "grad_norm": 4.1188596245249874e-11, "kl": 0.01715087890625, "learning_rate": 1.9516622366481794e-05, "loss": 0.0007, "num_tokens": 444688977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18980967824528464, "frac_reward_zero_std": 1.0, "grad_norm": 4.041104582119704e-11, "kl": 0.016876220703125, "learning_rate": 1.9514790733954195e-05, "loss": 0.0007, "num_tokens": 445258673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.18998037040198004, "frac_reward_zero_std": 1.0, "grad_norm": 4.251804422068577e-11, "kl": 0.01690673828125, "learning_rate": 1.9512955724019353e-05, "loss": 0.0007, "num_tokens": 445818529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19015106255867542, "frac_reward_zero_std": 1.0, "grad_norm": 4.1195949994250564e-11, "kl": 0.017974853515625, "learning_rate": 1.9511117337328625e-05, "loss": 0.0007, "num_tokens": 446387217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19032175471537083, "frac_reward_zero_std": 1.0, "grad_norm": 3.836686808342711e-11, "kl": 0.017364501953125, "learning_rate": 1.9509275574534575e-05, "loss": 0.0007, "num_tokens": 446953377.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19049244687206623, "frac_reward_zero_std": 1.0, "grad_norm": 4.096995044372167e-11, "kl": 0.017669677734375, "learning_rate": 1.950743043629096e-05, "loss": 0.0007, "num_tokens": 447518129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19066313902876164, "frac_reward_zero_std": 1.0, "grad_norm": 4.055574170499196e-11, "kl": 0.017547607421875, "learning_rate": 1.9505581923252743e-05, "loss": 0.0007, "num_tokens": 448084609.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19083383118545702, "frac_reward_zero_std": 1.0, "grad_norm": 4.061746291301885e-11, "kl": 0.016998291015625, "learning_rate": 1.9503730036076068e-05, "loss": 0.0007, "num_tokens": 448646769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19100452334215243, "frac_reward_zero_std": 1.0, "grad_norm": 4.039167039867605e-11, "kl": 0.017730712890625, "learning_rate": 1.95018747754183e-05, "loss": 0.0007, "num_tokens": 449209217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19117521549884783, "frac_reward_zero_std": 1.0, "grad_norm": 4.205419921968082e-11, "kl": 0.017913818359375, "learning_rate": 1.9500016141937982e-05, "loss": 0.0007, "num_tokens": 449774401.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19134590765554324, "frac_reward_zero_std": 1.0, "grad_norm": 4.1984372524105485e-11, "kl": 0.0185546875, "learning_rate": 1.9498154136294865e-05, "loss": 0.0007, "num_tokens": 450338961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19151659981223862, "frac_reward_zero_std": 1.0, "grad_norm": 4.265925752450191e-11, "kl": 0.017730712890625, "learning_rate": 1.9496288759149896e-05, "loss": 0.0007, "num_tokens": 450901665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19168729196893403, "frac_reward_zero_std": 1.0, "grad_norm": 4.009146617738196e-11, "kl": 0.017059326171875, "learning_rate": 1.949442001116521e-05, "loss": 0.0007, "num_tokens": 451470705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19185798412562943, "frac_reward_zero_std": 1.0, "grad_norm": 4.223443775017261e-11, "kl": 0.01751708984375, "learning_rate": 1.949254789300415e-05, "loss": 0.0007, "num_tokens": 452037601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19202867628232484, "frac_reward_zero_std": 1.0, "grad_norm": 3.937416811664816e-11, "kl": 0.01690673828125, "learning_rate": 1.9490672405331252e-05, "loss": 0.0007, "num_tokens": 452602353.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19219936843902022, "frac_reward_zero_std": 1.0, "grad_norm": 3.9753987710700534e-11, "kl": 0.017333984375, "learning_rate": 1.9488793548812243e-05, "loss": 0.0007, "num_tokens": 453167313.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19237006059571563, "frac_reward_zero_std": 1.0, "grad_norm": 3.9771779104323794e-11, "kl": 0.01751708984375, "learning_rate": 1.948691132411405e-05, "loss": 0.0007, "num_tokens": 453738673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19254075275241103, "frac_reward_zero_std": 1.0, "grad_norm": 4.004690191631721e-11, "kl": 0.0177001953125, "learning_rate": 1.948502573190479e-05, "loss": 0.0007, "num_tokens": 454308673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19271144490910644, "frac_reward_zero_std": 1.0, "grad_norm": 3.847291730992915e-11, "kl": 0.0164794921875, "learning_rate": 1.9483136772853788e-05, "loss": 0.0007, "num_tokens": 454874545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19288213706580182, "frac_reward_zero_std": 1.0, "grad_norm": 4.147826746997719e-11, "kl": 0.01776123046875, "learning_rate": 1.9481244447631552e-05, "loss": 0.0007, "num_tokens": 455437521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19305282922249722, "frac_reward_zero_std": 1.0, "grad_norm": 4.2616404742975335e-11, "kl": 0.017791748046875, "learning_rate": 1.9479348756909797e-05, "loss": 0.0007, "num_tokens": 456004193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19322352137919263, "frac_reward_zero_std": 1.0, "grad_norm": 3.827177536441267e-11, "kl": 0.01727294921875, "learning_rate": 1.9477449701361412e-05, "loss": 0.0007, "num_tokens": 456574065.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19339421353588804, "frac_reward_zero_std": 1.0, "grad_norm": 4.220328514189768e-11, "kl": 0.0179443359375, "learning_rate": 1.94755472816605e-05, "loss": 0.0007, "num_tokens": 457145025.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19356490569258342, "frac_reward_zero_std": 1.0, "grad_norm": 4.1064309018156825e-11, "kl": 0.017364501953125, "learning_rate": 1.947364149848235e-05, "loss": 0.0007, "num_tokens": 457713505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19373559784927882, "frac_reward_zero_std": 1.0, "grad_norm": 4.399927878095063e-11, "kl": 0.0174560546875, "learning_rate": 1.9471732352503447e-05, "loss": 0.0007, "num_tokens": 458278785.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19390629000597423, "frac_reward_zero_std": 1.0, "grad_norm": 4.309735147150048e-11, "kl": 0.01788330078125, "learning_rate": 1.946981984440147e-05, "loss": 0.0007, "num_tokens": 458841009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19407698216266964, "frac_reward_zero_std": 1.0, "grad_norm": 4.0313386629351397e-11, "kl": 0.0177001953125, "learning_rate": 1.946790397485529e-05, "loss": 0.0007, "num_tokens": 459403921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19424767431936502, "frac_reward_zero_std": 1.0, "grad_norm": 4.018134175766964e-11, "kl": 0.01751708984375, "learning_rate": 1.946598474454497e-05, "loss": 0.0007, "num_tokens": 459968129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19441836647606042, "frac_reward_zero_std": 1.0, "grad_norm": 4.0687157620141904e-11, "kl": 0.017852783203125, "learning_rate": 1.9464062154151765e-05, "loss": 0.0007, "num_tokens": 460532913.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19458905863275583, "frac_reward_zero_std": 1.0, "grad_norm": 4.173313171144635e-11, "kl": 0.01776123046875, "learning_rate": 1.9462136204358133e-05, "loss": 0.0007, "num_tokens": 461095633.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19475975078945124, "frac_reward_zero_std": 1.0, "grad_norm": 3.9176379951008676e-11, "kl": 0.017578125, "learning_rate": 1.9460206895847707e-05, "loss": 0.0007, "num_tokens": 461667089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19493044294614661, "frac_reward_zero_std": 1.0, "grad_norm": 3.984441609927819e-11, "kl": 0.01702880859375, "learning_rate": 1.9458274229305333e-05, "loss": 0.0007, "num_tokens": 462234401.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19510113510284202, "frac_reward_zero_std": 1.0, "grad_norm": 4.396898415656985e-11, "kl": 0.017852783203125, "learning_rate": 1.9456338205417026e-05, "loss": 0.0007, "num_tokens": 462796625.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19527182725953743, "frac_reward_zero_std": 1.0, "grad_norm": 3.862196062414466e-11, "kl": 0.017120361328125, "learning_rate": 1.9454398824870014e-05, "loss": 0.0007, "num_tokens": 463365217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19544251941623283, "frac_reward_zero_std": 1.0, "grad_norm": 4.079503905910863e-11, "kl": 0.017425537109375, "learning_rate": 1.94524560883527e-05, "loss": 0.0007, "num_tokens": 463926145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19561321157292821, "frac_reward_zero_std": 1.0, "grad_norm": 3.799387248740165e-11, "kl": 0.0174560546875, "learning_rate": 1.945050999655469e-05, "loss": 0.0007, "num_tokens": 464495953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19578390372962362, "frac_reward_zero_std": 1.0, "grad_norm": 4.131975218386463e-11, "kl": 0.017425537109375, "learning_rate": 1.9448560550166777e-05, "loss": 0.0007, "num_tokens": 465060049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19595459588631903, "frac_reward_zero_std": 1.0, "grad_norm": 4.2314398035496103e-11, "kl": 0.016937255859375, "learning_rate": 1.9446607749880942e-05, "loss": 0.0007, "num_tokens": 465626977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19612528804301443, "frac_reward_zero_std": 1.0, "grad_norm": 4.344359066183784e-11, "kl": 0.017852783203125, "learning_rate": 1.9444651596390355e-05, "loss": 0.0007, "num_tokens": 466189649.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1962959801997098, "frac_reward_zero_std": 1.0, "grad_norm": 4.352250278015198e-11, "kl": 0.017364501953125, "learning_rate": 1.9442692090389387e-05, "loss": 0.0007, "num_tokens": 466755265.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19646667235640522, "frac_reward_zero_std": 1.0, "grad_norm": 3.929739376039507e-11, "kl": 0.017333984375, "learning_rate": 1.9440729232573588e-05, "loss": 0.0007, "num_tokens": 467325281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19663736451310063, "frac_reward_zero_std": 1.0, "grad_norm": 3.893045640445249e-11, "kl": 0.0172119140625, "learning_rate": 1.9438763023639703e-05, "loss": 0.0007, "num_tokens": 467894433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19680805666979603, "frac_reward_zero_std": 1.0, "grad_norm": 4.315163748640926e-11, "kl": 0.017608642578125, "learning_rate": 1.9436793464285664e-05, "loss": 0.0007, "num_tokens": 468459041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1969787488264914, "frac_reward_zero_std": 1.0, "grad_norm": 4.205990361715419e-11, "kl": 0.017852783203125, "learning_rate": 1.9434820555210592e-05, "loss": 0.0007, "num_tokens": 469028433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19714944098318682, "frac_reward_zero_std": 1.0, "grad_norm": 4.424106601958965e-11, "kl": 0.017578125, "learning_rate": 1.9432844297114802e-05, "loss": 0.0007, "num_tokens": 469593281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19732013313988223, "frac_reward_zero_std": 1.0, "grad_norm": 3.961754975946697e-11, "kl": 0.016998291015625, "learning_rate": 1.9430864690699792e-05, "loss": 0.0007, "num_tokens": 470159073.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19749082529657763, "frac_reward_zero_std": 1.0, "grad_norm": 4.015028648055611e-11, "kl": 0.01715087890625, "learning_rate": 1.942888173666825e-05, "loss": 0.0007, "num_tokens": 470724545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.197661517453273, "frac_reward_zero_std": 1.0, "grad_norm": 4.246333272511839e-11, "kl": 0.0172119140625, "learning_rate": 1.9426895435724056e-05, "loss": 0.0007, "num_tokens": 471292641.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19783220960996842, "frac_reward_zero_std": 1.0, "grad_norm": 4.0953896678655724e-11, "kl": 0.016998291015625, "learning_rate": 1.9424905788572276e-05, "loss": 0.0007, "num_tokens": 471858177.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19800290176666382, "frac_reward_zero_std": 1.0, "grad_norm": 4.024958714619946e-11, "kl": 0.017486572265625, "learning_rate": 1.9422912795919157e-05, "loss": 0.0007, "num_tokens": 472425825.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19817359392335923, "frac_reward_zero_std": 1.0, "grad_norm": 4.035737300300303e-11, "kl": 0.017181396484375, "learning_rate": 1.9420916458472144e-05, "loss": 0.0007, "num_tokens": 472997505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1983442860800546, "frac_reward_zero_std": 1.0, "grad_norm": 4.270504667890817e-11, "kl": 0.0172119140625, "learning_rate": 1.9418916776939865e-05, "loss": 0.0007, "num_tokens": 473562225.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19851497823675002, "frac_reward_zero_std": 1.0, "grad_norm": 4.1384971213578484e-11, "kl": 0.01806640625, "learning_rate": 1.941691375203213e-05, "loss": 0.0007, "num_tokens": 474132609.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19868567039344542, "frac_reward_zero_std": 1.0, "grad_norm": 4.08597486494015e-11, "kl": 0.016937255859375, "learning_rate": 1.9414907384459952e-05, "loss": 0.0007, "num_tokens": 474702497.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19885636255014083, "frac_reward_zero_std": 1.0, "grad_norm": 4.420746147012031e-11, "kl": 0.01788330078125, "learning_rate": 1.9412897674935504e-05, "loss": 0.0007, "num_tokens": 475265873.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1990270547068362, "frac_reward_zero_std": 1.0, "grad_norm": 4.1862359169835964e-11, "kl": 0.01751708984375, "learning_rate": 1.941088462417217e-05, "loss": 0.0007, "num_tokens": 475828097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19919774686353162, "frac_reward_zero_std": 1.0, "grad_norm": 4.299012749911511e-11, "kl": 0.017059326171875, "learning_rate": 1.9408868232884508e-05, "loss": 0.0007, "num_tokens": 476394545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19936843902022702, "frac_reward_zero_std": 1.0, "grad_norm": 3.999700630633995e-11, "kl": 0.01739501953125, "learning_rate": 1.9406848501788264e-05, "loss": 0.0007, "num_tokens": 476964257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19953913117692243, "frac_reward_zero_std": 1.0, "grad_norm": 4.071637253923771e-11, "kl": 0.017364501953125, "learning_rate": 1.940482543160037e-05, "loss": 0.0007, "num_tokens": 477525953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.1997098233336178, "frac_reward_zero_std": 1.0, "grad_norm": 3.989365777914052e-11, "kl": 0.01751708984375, "learning_rate": 1.9402799023038938e-05, "loss": 0.0007, "num_tokens": 478087777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19988051549031322, "frac_reward_zero_std": 1.0, "grad_norm": 4.190742193775078e-11, "kl": 0.01751708984375, "learning_rate": 1.940076927682328e-05, "loss": 0.0007, "num_tokens": 478656513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20005120764700862, "frac_reward_zero_std": 1.0, "grad_norm": 4.1015394180073544e-11, "kl": 0.017181396484375, "learning_rate": 1.9398736193673873e-05, "loss": 0.0007, "num_tokens": 479219969.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20022189980370403, "frac_reward_zero_std": 1.0, "grad_norm": 3.805285171065202e-11, "kl": 0.017333984375, "learning_rate": 1.9396699774312395e-05, "loss": 0.0007, "num_tokens": 479789553.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2003925919603994, "frac_reward_zero_std": 1.0, "grad_norm": 4.1768696867090964e-11, "kl": 0.0174560546875, "learning_rate": 1.9394660019461696e-05, "loss": 0.0007, "num_tokens": 480356609.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20056328411709481, "frac_reward_zero_std": 1.0, "grad_norm": 4.246565749815653e-11, "kl": 0.0177001953125, "learning_rate": 1.939261692984582e-05, "loss": 0.0007, "num_tokens": 480917553.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20073397627379022, "frac_reward_zero_std": 1.0, "grad_norm": 4.233603528562647e-11, "kl": 0.01708984375, "learning_rate": 1.939057050618999e-05, "loss": 0.0007, "num_tokens": 481486545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20090466843048563, "frac_reward_zero_std": 1.0, "grad_norm": 4.030342903961818e-11, "kl": 0.0177001953125, "learning_rate": 1.9388520749220605e-05, "loss": 0.0007, "num_tokens": 482051409.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.201075360587181, "frac_reward_zero_std": 1.0, "grad_norm": 3.909172576695636e-11, "kl": 0.015869140625, "learning_rate": 1.9386467659665266e-05, "loss": 0.0006, "num_tokens": 482618801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2012460527438764, "frac_reward_zero_std": 1.0, "grad_norm": 4.0740257004292227e-11, "kl": 0.016845703125, "learning_rate": 1.9384411238252733e-05, "loss": 0.0007, "num_tokens": 483180193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20141674490057182, "frac_reward_zero_std": 1.0, "grad_norm": 4.1196153130927116e-11, "kl": 0.017303466796875, "learning_rate": 1.9382351485712973e-05, "loss": 0.0007, "num_tokens": 483740481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20158743705726723, "frac_reward_zero_std": 1.0, "grad_norm": 4.454710818473803e-11, "kl": 0.017425537109375, "learning_rate": 1.9380288402777118e-05, "loss": 0.0007, "num_tokens": 484305889.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2017581292139626, "frac_reward_zero_std": 1.0, "grad_norm": 3.936930311752485e-11, "kl": 0.01715087890625, "learning_rate": 1.9378221990177488e-05, "loss": 0.0007, "num_tokens": 484876353.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.201928821370658, "frac_reward_zero_std": 1.0, "grad_norm": 4.182098139584631e-11, "kl": 0.017578125, "learning_rate": 1.9376152248647587e-05, "loss": 0.0007, "num_tokens": 485439297.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20209951352735342, "frac_reward_zero_std": 1.0, "grad_norm": 4.149734860836331e-11, "kl": 0.017791748046875, "learning_rate": 1.9374079178922097e-05, "loss": 0.0007, "num_tokens": 486004241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20227020568404883, "frac_reward_zero_std": 1.0, "grad_norm": 4.221249682391584e-11, "kl": 0.018035888671875, "learning_rate": 1.9372002781736884e-05, "loss": 0.0007, "num_tokens": 486570801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2024408978407442, "frac_reward_zero_std": 1.0, "grad_norm": 4.164844020744148e-11, "kl": 0.017333984375, "learning_rate": 1.9369923057828994e-05, "loss": 0.0007, "num_tokens": 487140193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2026115899974396, "frac_reward_zero_std": 1.0, "grad_norm": 4.4466065973728834e-11, "kl": 0.017425537109375, "learning_rate": 1.9367840007936652e-05, "loss": 0.0007, "num_tokens": 487702193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20278228215413502, "frac_reward_zero_std": 1.0, "grad_norm": 4.3668800922198386e-11, "kl": 0.017669677734375, "learning_rate": 1.936575363279927e-05, "loss": 0.0007, "num_tokens": 488264993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20295297431083043, "frac_reward_zero_std": 1.0, "grad_norm": 4.248251545976425e-11, "kl": 0.017425537109375, "learning_rate": 1.9363663933157425e-05, "loss": 0.0007, "num_tokens": 488827153.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2031236664675258, "frac_reward_zero_std": 1.0, "grad_norm": 4.1298752842392366e-11, "kl": 0.017547607421875, "learning_rate": 1.9361570909752897e-05, "loss": 0.0007, "num_tokens": 489394017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2032943586242212, "frac_reward_zero_std": 1.0, "grad_norm": 4.149786886591235e-11, "kl": 0.01708984375, "learning_rate": 1.9359474563328632e-05, "loss": 0.0007, "num_tokens": 489954961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20346505078091662, "frac_reward_zero_std": 1.0, "grad_norm": 4.1864302498456646e-11, "kl": 0.017608642578125, "learning_rate": 1.935737489462875e-05, "loss": 0.0007, "num_tokens": 490514177.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20363574293761202, "frac_reward_zero_std": 1.0, "grad_norm": 4.36012118855537e-11, "kl": 0.0181884765625, "learning_rate": 1.9355271904398563e-05, "loss": 0.0007, "num_tokens": 491076241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2038064350943074, "frac_reward_zero_std": 1.0, "grad_norm": 3.985401310583916e-11, "kl": 0.01763916015625, "learning_rate": 1.935316559338456e-05, "loss": 0.0007, "num_tokens": 491640289.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2039771272510028, "frac_reward_zero_std": 1.0, "grad_norm": 4.2583467978454306e-11, "kl": 0.017822265625, "learning_rate": 1.9351055962334398e-05, "loss": 0.0007, "num_tokens": 492210225.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20414781940769822, "frac_reward_zero_std": 1.0, "grad_norm": 4.0919241045082e-11, "kl": 0.01708984375, "learning_rate": 1.9348943011996922e-05, "loss": 0.0007, "num_tokens": 492776929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20431851156439362, "frac_reward_zero_std": 1.0, "grad_norm": 4.048241939349133e-11, "kl": 0.018341064453125, "learning_rate": 1.9346826743122158e-05, "loss": 0.0007, "num_tokens": 493343905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.204489203721089, "frac_reward_zero_std": 1.0, "grad_norm": 4.0804372043842893e-11, "kl": 0.01763916015625, "learning_rate": 1.9344707156461297e-05, "loss": 0.0007, "num_tokens": 493917329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2046598958777844, "frac_reward_zero_std": 1.0, "grad_norm": 4.2674752303369604e-11, "kl": 0.01751708984375, "learning_rate": 1.9342584252766727e-05, "loss": 0.0007, "num_tokens": 494481009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20483058803447982, "frac_reward_zero_std": 1.0, "grad_norm": 4.2749566308603234e-11, "kl": 0.01739501953125, "learning_rate": 1.9340458032791987e-05, "loss": 0.0007, "num_tokens": 495051489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20500128019117522, "frac_reward_zero_std": 1.0, "grad_norm": 4.068920579399031e-11, "kl": 0.017486572265625, "learning_rate": 1.933832849729182e-05, "loss": 0.0007, "num_tokens": 495615185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20517197234787063, "frac_reward_zero_std": 1.0, "grad_norm": 4.077625494544084e-11, "kl": 0.017242431640625, "learning_rate": 1.933619564702213e-05, "loss": 0.0007, "num_tokens": 496181825.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.205342664504566, "frac_reward_zero_std": 1.0, "grad_norm": 4.2465475830107735e-11, "kl": 0.01708984375, "learning_rate": 1.9334059482740004e-05, "loss": 0.0007, "num_tokens": 496747569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20551335666126141, "frac_reward_zero_std": 1.0, "grad_norm": 4.208005921084326e-11, "kl": 0.017578125, "learning_rate": 1.93319200052037e-05, "loss": 0.0007, "num_tokens": 497314961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20568404881795682, "frac_reward_zero_std": 1.0, "grad_norm": 4.012512544559708e-11, "kl": 0.017364501953125, "learning_rate": 1.9329777215172657e-05, "loss": 0.0007, "num_tokens": 497879281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20585474097465223, "frac_reward_zero_std": 1.0, "grad_norm": 4.1682392175123263e-11, "kl": 0.01776123046875, "learning_rate": 1.9327631113407492e-05, "loss": 0.0007, "num_tokens": 498444017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2060254331313476, "frac_reward_zero_std": 1.0, "grad_norm": 4.053554653677862e-11, "kl": 0.0164794921875, "learning_rate": 1.9325481700669985e-05, "loss": 0.0007, "num_tokens": 499010993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20619612528804301, "frac_reward_zero_std": 1.0, "grad_norm": 4.029657666466948e-11, "kl": 0.01702880859375, "learning_rate": 1.932332897772311e-05, "loss": 0.0007, "num_tokens": 499577185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20636681744473842, "frac_reward_zero_std": 1.0, "grad_norm": 4.146810300315916e-11, "kl": 0.017608642578125, "learning_rate": 1.9321172945330996e-05, "loss": 0.0007, "num_tokens": 500140321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20653750960143383, "frac_reward_zero_std": 1.0, "grad_norm": 3.8233673387251707e-11, "kl": 0.017242431640625, "learning_rate": 1.9319013604258964e-05, "loss": 0.0007, "num_tokens": 500704513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2067082017581292, "frac_reward_zero_std": 1.0, "grad_norm": 4.1567534228104466e-11, "kl": 0.017608642578125, "learning_rate": 1.93168509552735e-05, "loss": 0.0007, "num_tokens": 501270769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2068788939148246, "frac_reward_zero_std": 1.0, "grad_norm": 4.022341238816618e-11, "kl": 0.017730712890625, "learning_rate": 1.9314684999142263e-05, "loss": 0.0007, "num_tokens": 501836497.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20704958607152002, "frac_reward_zero_std": 1.0, "grad_norm": 3.989252571002994e-11, "kl": 0.016632080078125, "learning_rate": 1.9312515736634094e-05, "loss": 0.0007, "num_tokens": 502399137.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20722027822821543, "frac_reward_zero_std": 1.0, "grad_norm": 4.251049679165372e-11, "kl": 0.0177001953125, "learning_rate": 1.9310343168519e-05, "loss": 0.0007, "num_tokens": 502964929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2073909703849108, "frac_reward_zero_std": 1.0, "grad_norm": 3.7220019900651983e-11, "kl": 0.01739501953125, "learning_rate": 1.9308167295568165e-05, "loss": 0.0007, "num_tokens": 503532945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2075616625416062, "frac_reward_zero_std": 1.0, "grad_norm": 3.7560602737520214e-11, "kl": 0.017913818359375, "learning_rate": 1.930598811855395e-05, "loss": 0.0007, "num_tokens": 504098817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20773235469830162, "frac_reward_zero_std": 1.0, "grad_norm": 4.223118351298649e-11, "kl": 0.018585205078125, "learning_rate": 1.9303805638249873e-05, "loss": 0.0007, "num_tokens": 504661617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20790304685499703, "frac_reward_zero_std": 1.0, "grad_norm": 4.162803952735078e-11, "kl": 0.017669677734375, "learning_rate": 1.9301619855430646e-05, "loss": 0.0007, "num_tokens": 505229777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2080737390116924, "frac_reward_zero_std": 1.0, "grad_norm": 3.748444114195531e-11, "kl": 0.017669677734375, "learning_rate": 1.929943077087214e-05, "loss": 0.0007, "num_tokens": 505806369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2082444311683878, "frac_reward_zero_std": 1.0, "grad_norm": 4.0749203276655287e-11, "kl": 0.01654052734375, "learning_rate": 1.92972383853514e-05, "loss": 0.0007, "num_tokens": 506372081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20841512332508322, "frac_reward_zero_std": 1.0, "grad_norm": 3.857555853287168e-11, "kl": 0.016632080078125, "learning_rate": 1.9295042699646645e-05, "loss": 0.0007, "num_tokens": 506939921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20858581548177862, "frac_reward_zero_std": 1.0, "grad_norm": 3.9720735228813496e-11, "kl": 0.01806640625, "learning_rate": 1.9292843714537262e-05, "loss": 0.0007, "num_tokens": 507505409.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.208756507638474, "frac_reward_zero_std": 1.0, "grad_norm": 3.764708333438797e-11, "kl": 0.017608642578125, "learning_rate": 1.9290641430803813e-05, "loss": 0.0007, "num_tokens": 508077681.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2089271997951694, "frac_reward_zero_std": 1.0, "grad_norm": 3.984881700327947e-11, "kl": 0.016876220703125, "learning_rate": 1.928843584922803e-05, "loss": 0.0007, "num_tokens": 508643857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20909789195186482, "frac_reward_zero_std": 1.0, "grad_norm": 4.281619676478183e-11, "kl": 0.01788330078125, "learning_rate": 1.9286226970592814e-05, "loss": 0.0007, "num_tokens": 509207905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20926858410856022, "frac_reward_zero_std": 1.0, "grad_norm": 4.422751625259279e-11, "kl": 0.0186767578125, "learning_rate": 1.9284014795682234e-05, "loss": 0.0007, "num_tokens": 509769793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2094392762652556, "frac_reward_zero_std": 1.0, "grad_norm": 3.89787056459805e-11, "kl": 0.016815185546875, "learning_rate": 1.928179932528154e-05, "loss": 0.0007, "num_tokens": 510336257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.209609968421951, "frac_reward_zero_std": 1.0, "grad_norm": 4.540409686717258e-11, "kl": 0.0177001953125, "learning_rate": 1.9279580560177135e-05, "loss": 0.0007, "num_tokens": 510902129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20978066057864642, "frac_reward_zero_std": 1.0, "grad_norm": 3.7324425241825155e-11, "kl": 0.01739501953125, "learning_rate": 1.9277358501156606e-05, "loss": 0.0007, "num_tokens": 511481393.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.20995135273534182, "frac_reward_zero_std": 1.0, "grad_norm": 4.187521009967426e-11, "kl": 0.017822265625, "learning_rate": 1.9275133149008703e-05, "loss": 0.0007, "num_tokens": 512045713.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2101220448920372, "frac_reward_zero_std": 1.0, "grad_norm": 3.897420609245825e-11, "kl": 0.01739501953125, "learning_rate": 1.9272904504523344e-05, "loss": 0.0007, "num_tokens": 512614961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2102927370487326, "frac_reward_zero_std": 1.0, "grad_norm": 3.7506729564967396e-11, "kl": 0.017181396484375, "learning_rate": 1.9270672568491623e-05, "loss": 0.0007, "num_tokens": 513183729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21046342920542802, "frac_reward_zero_std": 1.0, "grad_norm": 4.0698931926590935e-11, "kl": 0.0169677734375, "learning_rate": 1.9268437341705784e-05, "loss": 0.0007, "num_tokens": 513755089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21063412136212342, "frac_reward_zero_std": 1.0, "grad_norm": 4.234628333956254e-11, "kl": 0.017333984375, "learning_rate": 1.926619882495927e-05, "loss": 0.0007, "num_tokens": 514320801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2108048135188188, "frac_reward_zero_std": 1.0, "grad_norm": 4.0442360149579746e-11, "kl": 0.016937255859375, "learning_rate": 1.9263957019046653e-05, "loss": 0.0007, "num_tokens": 514885537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2109755056755142, "frac_reward_zero_std": 1.0, "grad_norm": 4.4158904811774554e-11, "kl": 0.017822265625, "learning_rate": 1.9261711924763713e-05, "loss": 0.0007, "num_tokens": 515444129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21114619783220961, "frac_reward_zero_std": 1.0, "grad_norm": 4.083910494883692e-11, "kl": 0.017486572265625, "learning_rate": 1.9259463542907366e-05, "loss": 0.0007, "num_tokens": 516013249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21131688998890502, "frac_reward_zero_std": 1.0, "grad_norm": 4.214658595360899e-11, "kl": 0.01739501953125, "learning_rate": 1.9257211874275708e-05, "loss": 0.0007, "num_tokens": 516576705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2114875821456004, "frac_reward_zero_std": 1.0, "grad_norm": 4.1698606492175526e-11, "kl": 0.01739501953125, "learning_rate": 1.9254956919668e-05, "loss": 0.0007, "num_tokens": 517141953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2116582743022958, "frac_reward_zero_std": 1.0, "grad_norm": 4.04270666367615e-11, "kl": 0.017425537109375, "learning_rate": 1.9252698679884667e-05, "loss": 0.0007, "num_tokens": 517708385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2118289664589912, "frac_reward_zero_std": 1.0, "grad_norm": 4.227513704080738e-11, "kl": 0.017913818359375, "learning_rate": 1.925043715572731e-05, "loss": 0.0007, "num_tokens": 518274225.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21199965861568662, "frac_reward_zero_std": 1.0, "grad_norm": 4.265891219622277e-11, "kl": 0.01763916015625, "learning_rate": 1.9248172347998685e-05, "loss": 0.0007, "num_tokens": 518840465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.212170350772382, "frac_reward_zero_std": 1.0, "grad_norm": 4.156582741525305e-11, "kl": 0.017669677734375, "learning_rate": 1.9245904257502714e-05, "loss": 0.0007, "num_tokens": 519407073.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2123410429290774, "frac_reward_zero_std": 1.0, "grad_norm": 4.1096490484619396e-11, "kl": 0.017425537109375, "learning_rate": 1.9243632885044493e-05, "loss": 0.0007, "num_tokens": 519972817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2125117350857728, "frac_reward_zero_std": 1.0, "grad_norm": 3.9924628338365576e-11, "kl": 0.017242431640625, "learning_rate": 1.924135823143027e-05, "loss": 0.0007, "num_tokens": 520535281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21268242724246822, "frac_reward_zero_std": 1.0, "grad_norm": 4.1016883304059885e-11, "kl": 0.0172119140625, "learning_rate": 1.923908029746747e-05, "loss": 0.0007, "num_tokens": 521100785.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2128531193991636, "frac_reward_zero_std": 1.0, "grad_norm": 3.8203790125960225e-11, "kl": 0.0164794921875, "learning_rate": 1.9236799083964678e-05, "loss": 0.0007, "num_tokens": 521691809.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.213023811555859, "frac_reward_zero_std": 1.0, "grad_norm": 4.399755362532845e-11, "kl": 0.017303466796875, "learning_rate": 1.9234514591731635e-05, "loss": 0.0007, "num_tokens": 522258641.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2131945037125544, "frac_reward_zero_std": 1.0, "grad_norm": 4.387087129328313e-11, "kl": 0.017303466796875, "learning_rate": 1.923222682157926e-05, "loss": 0.0007, "num_tokens": 522822257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21336519586924982, "frac_reward_zero_std": 1.0, "grad_norm": 4.100734145492022e-11, "kl": 0.017608642578125, "learning_rate": 1.9229935774319634e-05, "loss": 0.0007, "num_tokens": 523396513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2135358880259452, "frac_reward_zero_std": 1.0, "grad_norm": 4.204518589810312e-11, "kl": 0.017669677734375, "learning_rate": 1.9227641450765985e-05, "loss": 0.0007, "num_tokens": 523962161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2137065801826406, "frac_reward_zero_std": 1.0, "grad_norm": 4.089223249925033e-11, "kl": 0.0174560546875, "learning_rate": 1.9225343851732724e-05, "loss": 0.0007, "num_tokens": 524530913.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.213877272339336, "frac_reward_zero_std": 1.0, "grad_norm": 3.997349681237486e-11, "kl": 0.017578125, "learning_rate": 1.9223042978035405e-05, "loss": 0.0007, "num_tokens": 525099569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21404796449603142, "frac_reward_zero_std": 1.0, "grad_norm": 4.2959768807115493e-11, "kl": 0.016815185546875, "learning_rate": 1.9220738830490764e-05, "loss": 0.0007, "num_tokens": 525663681.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2142186566527268, "frac_reward_zero_std": 1.0, "grad_norm": 3.9065163715005176e-11, "kl": 0.017120361328125, "learning_rate": 1.921843140991669e-05, "loss": 0.0007, "num_tokens": 526232753.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2143893488094222, "frac_reward_zero_std": 1.0, "grad_norm": 4.255740323464848e-11, "kl": 0.01776123046875, "learning_rate": 1.9216120717132227e-05, "loss": 0.0007, "num_tokens": 526797505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2145600409661176, "frac_reward_zero_std": 1.0, "grad_norm": 4.0853948855973216e-11, "kl": 0.0172119140625, "learning_rate": 1.9213806752957594e-05, "loss": 0.0007, "num_tokens": 527364385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21473073312281302, "frac_reward_zero_std": 1.0, "grad_norm": 4.070841450729366e-11, "kl": 0.016815185546875, "learning_rate": 1.9211489518214165e-05, "loss": 0.0007, "num_tokens": 527934801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2149014252795084, "frac_reward_zero_std": 1.0, "grad_norm": 3.973808448881391e-11, "kl": 0.0177001953125, "learning_rate": 1.9209169013724472e-05, "loss": 0.0007, "num_tokens": 528497073.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2150721174362038, "frac_reward_zero_std": 1.0, "grad_norm": 3.921925986403929e-11, "kl": 0.018341064453125, "learning_rate": 1.9206845240312212e-05, "loss": 0.0007, "num_tokens": 529061953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2152428095928992, "frac_reward_zero_std": 1.0, "grad_norm": 4.03781557572562e-11, "kl": 0.0174560546875, "learning_rate": 1.920451819880224e-05, "loss": 0.0007, "num_tokens": 529624929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21541350174959462, "frac_reward_zero_std": 1.0, "grad_norm": 4.020624306421938e-11, "kl": 0.017608642578125, "learning_rate": 1.920218789002057e-05, "loss": 0.0007, "num_tokens": 530190577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21558419390629, "frac_reward_zero_std": 1.0, "grad_norm": 4.089829514967366e-11, "kl": 0.017303466796875, "learning_rate": 1.9199854314794377e-05, "loss": 0.0007, "num_tokens": 530762401.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2157548860629854, "frac_reward_zero_std": 1.0, "grad_norm": 4.055929663786369e-11, "kl": 0.0174560546875, "learning_rate": 1.9197517473952e-05, "loss": 0.0007, "num_tokens": 531321985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2159255782196808, "frac_reward_zero_std": 1.0, "grad_norm": 4.318659098590986e-11, "kl": 0.0177001953125, "learning_rate": 1.9195177368322932e-05, "loss": 0.0007, "num_tokens": 531883633.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21609627037637621, "frac_reward_zero_std": 1.0, "grad_norm": 4.214024484169245e-11, "kl": 0.016845703125, "learning_rate": 1.9192833998737823e-05, "loss": 0.0007, "num_tokens": 532449697.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2162669625330716, "frac_reward_zero_std": 1.0, "grad_norm": 4.232233542483184e-11, "kl": 0.016937255859375, "learning_rate": 1.9190487366028486e-05, "loss": 0.0007, "num_tokens": 533017601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.216437654689767, "frac_reward_zero_std": 1.0, "grad_norm": 4.2900393249102904e-11, "kl": 0.018310546875, "learning_rate": 1.9188137471027898e-05, "loss": 0.0007, "num_tokens": 533576753.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2166083468464624, "frac_reward_zero_std": 1.0, "grad_norm": 4.3681614056663306e-11, "kl": 0.016876220703125, "learning_rate": 1.9185784314570172e-05, "loss": 0.0007, "num_tokens": 534143633.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21677903900315781, "frac_reward_zero_std": 1.0, "grad_norm": 4.168103317783928e-11, "kl": 0.017303466796875, "learning_rate": 1.918342789749061e-05, "loss": 0.0007, "num_tokens": 534710241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2169497311598532, "frac_reward_zero_std": 1.0, "grad_norm": 4.1543902683822583e-11, "kl": 0.017181396484375, "learning_rate": 1.918106822062564e-05, "loss": 0.0007, "num_tokens": 535280289.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2171204233165486, "frac_reward_zero_std": 1.0, "grad_norm": 4.454400703017956e-11, "kl": 0.017974853515625, "learning_rate": 1.917870528481287e-05, "loss": 0.0007, "num_tokens": 535840497.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.217291115473244, "frac_reward_zero_std": 1.0, "grad_norm": 4.068789399055448e-11, "kl": 0.0174560546875, "learning_rate": 1.917633909089106e-05, "loss": 0.0007, "num_tokens": 536411249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2174618076299394, "frac_reward_zero_std": 1.0, "grad_norm": 4.082291843923357e-11, "kl": 0.017364501953125, "learning_rate": 1.9173969639700117e-05, "loss": 0.0007, "num_tokens": 536974497.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2176324997866348, "frac_reward_zero_std": 1.0, "grad_norm": 4.029905158916729e-11, "kl": 0.01702880859375, "learning_rate": 1.9171596932081117e-05, "loss": 0.0007, "num_tokens": 537536897.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2178031919433302, "frac_reward_zero_std": 1.0, "grad_norm": 4.364393927543179e-11, "kl": 0.017364501953125, "learning_rate": 1.916922096887628e-05, "loss": 0.0007, "num_tokens": 538097201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2179738841000256, "frac_reward_zero_std": 1.0, "grad_norm": 4.214539945068228e-11, "kl": 0.01788330078125, "learning_rate": 1.9166841750928985e-05, "loss": 0.0007, "num_tokens": 538663457.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.218144576256721, "frac_reward_zero_std": 1.0, "grad_norm": 4.2708243660201624e-11, "kl": 0.016998291015625, "learning_rate": 1.9164459279083777e-05, "loss": 0.0007, "num_tokens": 539224593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2183152684134164, "frac_reward_zero_std": 1.0, "grad_norm": 3.998704344867342e-11, "kl": 0.01739501953125, "learning_rate": 1.916207355418634e-05, "loss": 0.0007, "num_tokens": 539794129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2184859605701118, "frac_reward_zero_std": 1.0, "grad_norm": 4.086856808141335e-11, "kl": 0.017913818359375, "learning_rate": 1.9159684577083525e-05, "loss": 0.0007, "num_tokens": 540354561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2186566527268072, "frac_reward_zero_std": 1.0, "grad_norm": 4.199187005794378e-11, "kl": 0.017486572265625, "learning_rate": 1.915729234862333e-05, "loss": 0.0007, "num_tokens": 540919073.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2188273448835026, "frac_reward_zero_std": 1.0, "grad_norm": 4.21410047432705e-11, "kl": 0.0177001953125, "learning_rate": 1.9154896869654908e-05, "loss": 0.0007, "num_tokens": 541483409.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.218998037040198, "frac_reward_zero_std": 1.0, "grad_norm": 4.041409238124214e-11, "kl": 0.017608642578125, "learning_rate": 1.9152498141028575e-05, "loss": 0.0007, "num_tokens": 542050945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2191687291968934, "frac_reward_zero_std": 1.0, "grad_norm": 4.100345834968802e-11, "kl": 0.018035888671875, "learning_rate": 1.9150096163595785e-05, "loss": 0.0007, "num_tokens": 542615185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2193394213535888, "frac_reward_zero_std": 1.0, "grad_norm": 4.018617218847766e-11, "kl": 0.01751708984375, "learning_rate": 1.9147690938209158e-05, "loss": 0.0007, "num_tokens": 543183761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2195101135102842, "frac_reward_zero_std": 1.0, "grad_norm": 4.18208981715528e-11, "kl": 0.0177001953125, "learning_rate": 1.9145282465722457e-05, "loss": 0.0007, "num_tokens": 543749121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2196808056669796, "frac_reward_zero_std": 1.0, "grad_norm": 4.000329656461686e-11, "kl": 0.01727294921875, "learning_rate": 1.914287074699061e-05, "loss": 0.0007, "num_tokens": 544314337.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.219851497823675, "frac_reward_zero_std": 1.0, "grad_norm": 4.1414209684070814e-11, "kl": 0.01739501953125, "learning_rate": 1.9140455782869684e-05, "loss": 0.0007, "num_tokens": 544877329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2200221899803704, "frac_reward_zero_std": 1.0, "grad_norm": 3.9784626972589845e-11, "kl": 0.017425537109375, "learning_rate": 1.913803757421691e-05, "loss": 0.0007, "num_tokens": 545444833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2201928821370658, "frac_reward_zero_std": 1.0, "grad_norm": 4.2814679837791335e-11, "kl": 0.017578125, "learning_rate": 1.9135616121890657e-05, "loss": 0.0007, "num_tokens": 546006417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2203635742937612, "frac_reward_zero_std": 1.0, "grad_norm": 4.236560589698576e-11, "kl": 0.0166015625, "learning_rate": 1.9133191426750463e-05, "loss": 0.0007, "num_tokens": 546578817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2205342664504566, "frac_reward_zero_std": 1.0, "grad_norm": 3.9321503438012544e-11, "kl": 0.01776123046875, "learning_rate": 1.9130763489657e-05, "loss": 0.0007, "num_tokens": 547149473.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.220704958607152, "frac_reward_zero_std": 1.0, "grad_norm": 4.060970221409215e-11, "kl": 0.017059326171875, "learning_rate": 1.91283323114721e-05, "loss": 0.0007, "num_tokens": 547716129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2208756507638474, "frac_reward_zero_std": 1.0, "grad_norm": 4.06022099699792e-11, "kl": 0.01708984375, "learning_rate": 1.9125897893058737e-05, "loss": 0.0007, "num_tokens": 548284641.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2210463429205428, "frac_reward_zero_std": 1.0, "grad_norm": 4.250150289319114e-11, "kl": 0.017303466796875, "learning_rate": 1.9123460235281055e-05, "loss": 0.0007, "num_tokens": 548848609.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2212170350772382, "frac_reward_zero_std": 1.0, "grad_norm": 4.01712726148904e-11, "kl": 0.0177001953125, "learning_rate": 1.9121019339004324e-05, "loss": 0.0007, "num_tokens": 549410785.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2213877272339336, "frac_reward_zero_std": 1.0, "grad_norm": 4.065740479259878e-11, "kl": 0.01715087890625, "learning_rate": 1.9118575205094983e-05, "loss": 0.0007, "num_tokens": 549976337.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.221558419390629, "frac_reward_zero_std": 1.0, "grad_norm": 3.955259204092731e-11, "kl": 0.017822265625, "learning_rate": 1.91161278344206e-05, "loss": 0.0007, "num_tokens": 550538641.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2217291115473244, "frac_reward_zero_std": 1.0, "grad_norm": 3.832034459459922e-11, "kl": 0.017669677734375, "learning_rate": 1.9113677227849908e-05, "loss": 0.0007, "num_tokens": 551107761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2218998037040198, "frac_reward_zero_std": 1.0, "grad_norm": 4.093774489871391e-11, "kl": 0.01690673828125, "learning_rate": 1.911122338625279e-05, "loss": 0.0007, "num_tokens": 551673585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2220704958607152, "frac_reward_zero_std": 1.0, "grad_norm": 4.145381189280251e-11, "kl": 0.017059326171875, "learning_rate": 1.9108766310500265e-05, "loss": 0.0007, "num_tokens": 552240193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2222411880174106, "frac_reward_zero_std": 1.0, "grad_norm": 4.152765406535632e-11, "kl": 0.018035888671875, "learning_rate": 1.9106306001464507e-05, "loss": 0.0007, "num_tokens": 552800993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.222411880174106, "frac_reward_zero_std": 1.0, "grad_norm": 4.125747848678309e-11, "kl": 0.017303466796875, "learning_rate": 1.9103842460018837e-05, "loss": 0.0007, "num_tokens": 553367425.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2225825723308014, "frac_reward_zero_std": 1.0, "grad_norm": 3.852407900805027e-11, "kl": 0.017364501953125, "learning_rate": 1.9101375687037722e-05, "loss": 0.0007, "num_tokens": 553935537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2227532644874968, "frac_reward_zero_std": 1.0, "grad_norm": 4.05943612437263e-11, "kl": 0.016845703125, "learning_rate": 1.909890568339678e-05, "loss": 0.0007, "num_tokens": 554503617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2229239566441922, "frac_reward_zero_std": 1.0, "grad_norm": 4.105180593876671e-11, "kl": 0.0177001953125, "learning_rate": 1.9096432449972772e-05, "loss": 0.0007, "num_tokens": 555069281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2230946488008876, "frac_reward_zero_std": 1.0, "grad_norm": 4.029073704832895e-11, "kl": 0.017242431640625, "learning_rate": 1.9093955987643605e-05, "loss": 0.0007, "num_tokens": 555638369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.223265340957583, "frac_reward_zero_std": 1.0, "grad_norm": 3.8856528483123814e-11, "kl": 0.016937255859375, "learning_rate": 1.909147629728834e-05, "loss": 0.0007, "num_tokens": 556205089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2234360331142784, "frac_reward_zero_std": 1.0, "grad_norm": 3.916676726283094e-11, "kl": 0.016815185546875, "learning_rate": 1.908899337978717e-05, "loss": 0.0007, "num_tokens": 556771009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2236067252709738, "frac_reward_zero_std": 1.0, "grad_norm": 4.148755975203185e-11, "kl": 0.017547607421875, "learning_rate": 1.908650723602144e-05, "loss": 0.0007, "num_tokens": 557331873.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2237774174276692, "frac_reward_zero_std": 1.0, "grad_norm": 4.039130790161992e-11, "kl": 0.016845703125, "learning_rate": 1.9084017866873653e-05, "loss": 0.0007, "num_tokens": 557895985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2239481095843646, "frac_reward_zero_std": 1.0, "grad_norm": 4.049310644652193e-11, "kl": 0.01715087890625, "learning_rate": 1.9081525273227433e-05, "loss": 0.0007, "num_tokens": 558461745.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22411880174106, "frac_reward_zero_std": 1.0, "grad_norm": 4.3599084605095935e-11, "kl": 0.01739501953125, "learning_rate": 1.9079029455967567e-05, "loss": 0.0007, "num_tokens": 559025121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2242894938977554, "frac_reward_zero_std": 1.0, "grad_norm": 4.3498460670185165e-11, "kl": 0.018096923828125, "learning_rate": 1.9076530415979976e-05, "loss": 0.0007, "num_tokens": 559588353.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2244601860544508, "frac_reward_zero_std": 1.0, "grad_norm": 4.30575578567677e-11, "kl": 0.01715087890625, "learning_rate": 1.907402815415173e-05, "loss": 0.0007, "num_tokens": 560150753.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2246308782111462, "frac_reward_zero_std": 1.0, "grad_norm": 4.210869151715719e-11, "kl": 0.017669677734375, "learning_rate": 1.907152267137105e-05, "loss": 0.0007, "num_tokens": 560711569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2248015703678416, "frac_reward_zero_std": 1.0, "grad_norm": 4.15294046863983e-11, "kl": 0.01837158203125, "learning_rate": 1.9069013968527276e-05, "loss": 0.0007, "num_tokens": 561278129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.224972262524537, "frac_reward_zero_std": 1.0, "grad_norm": 4.080299399220112e-11, "kl": 0.01715087890625, "learning_rate": 1.906650204651092e-05, "loss": 0.0007, "num_tokens": 561847441.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2251429546812324, "frac_reward_zero_std": 1.0, "grad_norm": 3.9939791059111744e-11, "kl": 0.017059326171875, "learning_rate": 1.9063986906213622e-05, "loss": 0.0007, "num_tokens": 562411521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2253136468379278, "frac_reward_zero_std": 1.0, "grad_norm": 4.047917432969205e-11, "kl": 0.01708984375, "learning_rate": 1.9061468548528158e-05, "loss": 0.0007, "num_tokens": 562975297.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2254843389946232, "frac_reward_zero_std": 1.0, "grad_norm": 4.0701794079555085e-11, "kl": 0.0174560546875, "learning_rate": 1.9058946974348464e-05, "loss": 0.0007, "num_tokens": 563539041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2256550311513186, "frac_reward_zero_std": 1.0, "grad_norm": 3.886757500339715e-11, "kl": 0.016815185546875, "learning_rate": 1.9056422184569604e-05, "loss": 0.0007, "num_tokens": 564107569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.225825723308014, "frac_reward_zero_std": 1.0, "grad_norm": 4.0439430472592665e-11, "kl": 0.016845703125, "learning_rate": 1.9053894180087784e-05, "loss": 0.0007, "num_tokens": 564671985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2259964154647094, "frac_reward_zero_std": 1.0, "grad_norm": 3.903959075468908e-11, "kl": 0.01739501953125, "learning_rate": 1.9051362961800356e-05, "loss": 0.0007, "num_tokens": 565239409.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2261671076214048, "frac_reward_zero_std": 1.0, "grad_norm": 3.969293588173342e-11, "kl": 0.01715087890625, "learning_rate": 1.9048828530605812e-05, "loss": 0.0007, "num_tokens": 565807217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2263377997781002, "frac_reward_zero_std": 1.0, "grad_norm": 4.080058217325318e-11, "kl": 0.017181396484375, "learning_rate": 1.9046290887403788e-05, "loss": 0.0007, "num_tokens": 566378145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2265084919347956, "frac_reward_zero_std": 1.0, "grad_norm": 4.388672994549471e-11, "kl": 0.0174560546875, "learning_rate": 1.9043750033095046e-05, "loss": 0.0007, "num_tokens": 566941569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.226679184091491, "frac_reward_zero_std": 1.0, "grad_norm": 4.464370818294131e-11, "kl": 0.01800537109375, "learning_rate": 1.9041205968581505e-05, "loss": 0.0007, "num_tokens": 567505025.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2268498762481864, "frac_reward_zero_std": 1.0, "grad_norm": 4.077025925316756e-11, "kl": 0.016998291015625, "learning_rate": 1.903865869476621e-05, "loss": 0.0007, "num_tokens": 568069617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2270205684048818, "frac_reward_zero_std": 1.0, "grad_norm": 4.046346970164973e-11, "kl": 0.017578125, "learning_rate": 1.9036108212553364e-05, "loss": 0.0007, "num_tokens": 568640529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2271912605615772, "frac_reward_zero_std": 1.0, "grad_norm": 4.103813357485396e-11, "kl": 0.01739501953125, "learning_rate": 1.9033554522848282e-05, "loss": 0.0007, "num_tokens": 569209505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2273619527182726, "frac_reward_zero_std": 1.0, "grad_norm": 4.42815053174141e-11, "kl": 0.01788330078125, "learning_rate": 1.9030997626557437e-05, "loss": 0.0007, "num_tokens": 569773857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.227532644874968, "frac_reward_zero_std": 1.0, "grad_norm": 4.1161248624052163e-11, "kl": 0.0164794921875, "learning_rate": 1.9028437524588433e-05, "loss": 0.0007, "num_tokens": 570341777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2277033370316634, "frac_reward_zero_std": 1.0, "grad_norm": 4.044154521463404e-11, "kl": 0.017059326171875, "learning_rate": 1.902587421785002e-05, "loss": 0.0007, "num_tokens": 570906881.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2278740291883588, "frac_reward_zero_std": 1.0, "grad_norm": 4.094514274814138e-11, "kl": 0.017547607421875, "learning_rate": 1.902330770725207e-05, "loss": 0.0007, "num_tokens": 571471137.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22804472134505419, "frac_reward_zero_std": 1.0, "grad_norm": 4.091916776962416e-11, "kl": 0.0174560546875, "learning_rate": 1.902073799370561e-05, "loss": 0.0007, "num_tokens": 572034369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2282154135017496, "frac_reward_zero_std": 1.0, "grad_norm": 4.1789053020140886e-11, "kl": 0.01678466796875, "learning_rate": 1.9018165078122792e-05, "loss": 0.0007, "num_tokens": 572598273.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.228386105658445, "frac_reward_zero_std": 1.0, "grad_norm": 4.0243267111905104e-11, "kl": 0.01690673828125, "learning_rate": 1.901558896141691e-05, "loss": 0.0007, "num_tokens": 573166241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2285567978151404, "frac_reward_zero_std": 1.0, "grad_norm": 3.979246477831532e-11, "kl": 0.0174560546875, "learning_rate": 1.9013009644502386e-05, "loss": 0.0007, "num_tokens": 573735537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22872748997183578, "frac_reward_zero_std": 1.0, "grad_norm": 3.891969906147388e-11, "kl": 0.017242431640625, "learning_rate": 1.9010427128294792e-05, "loss": 0.0007, "num_tokens": 574302305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2288981821285312, "frac_reward_zero_std": 1.0, "grad_norm": 4.093336392116044e-11, "kl": 0.016815185546875, "learning_rate": 1.9007841413710827e-05, "loss": 0.0007, "num_tokens": 574862257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2290688742852266, "frac_reward_zero_std": 1.0, "grad_norm": 4.335916931557139e-11, "kl": 0.017333984375, "learning_rate": 1.9005252501668322e-05, "loss": 0.0007, "num_tokens": 575424513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.229239566441922, "frac_reward_zero_std": 1.0, "grad_norm": 364738332296.64514, "kl": 12675186688.0, "learning_rate": 1.9002660393086253e-05, "loss": 507770880.0, "num_tokens": 576018705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22941025859861738, "frac_reward_zero_std": 1.0, "grad_norm": 6.920911083399232e-11, "kl": 0.017791748046875, "learning_rate": 1.900006508888472e-05, "loss": 0.0007, "num_tokens": 576580993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2295809507553128, "frac_reward_zero_std": 1.0, "grad_norm": 8.00225050377579e-11, "kl": 0.017425537109375, "learning_rate": 1.899746658998496e-05, "loss": 0.0007, "num_tokens": 577152097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2297516429120082, "frac_reward_zero_std": 1.0, "grad_norm": 8.147903246775639e-11, "kl": 0.017578125, "learning_rate": 1.8994864897309355e-05, "loss": 0.0007, "num_tokens": 577714545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2299223350687036, "frac_reward_zero_std": 1.0, "grad_norm": 8.807710470001915e-11, "kl": 0.017730712890625, "learning_rate": 1.8992260011781403e-05, "loss": 0.0007, "num_tokens": 578278977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23009302722539898, "frac_reward_zero_std": 1.0, "grad_norm": 9.007805688732802e-11, "kl": 0.01690673828125, "learning_rate": 1.8989651934325755e-05, "loss": 0.0007, "num_tokens": 578845585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2302637193820944, "frac_reward_zero_std": 1.0, "grad_norm": 1.0114085677797673e-10, "kl": 0.017486572265625, "learning_rate": 1.8987040665868174e-05, "loss": 0.0007, "num_tokens": 579407921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2304344115387898, "frac_reward_zero_std": 1.0, "grad_norm": 9.762670062766025e-11, "kl": 0.01824951171875, "learning_rate": 1.8984426207335575e-05, "loss": 0.0007, "num_tokens": 579980993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2306051036954852, "frac_reward_zero_std": 1.0, "grad_norm": 1.040184687446733e-10, "kl": 0.01824951171875, "learning_rate": 1.8981808559655987e-05, "loss": 0.0007, "num_tokens": 580547697.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23077579585218058, "frac_reward_zero_std": 1.0, "grad_norm": 1.0576017878343163e-10, "kl": 0.016845703125, "learning_rate": 1.897918772375858e-05, "loss": 0.0007, "num_tokens": 581116161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.230946488008876, "frac_reward_zero_std": 1.0, "grad_norm": 1.157988984852879e-10, "kl": 0.016815185546875, "learning_rate": 1.8976563700573672e-05, "loss": 0.0007, "num_tokens": 581688945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2311171801655714, "frac_reward_zero_std": 1.0, "grad_norm": 1.0429646912493465e-10, "kl": 0.017181396484375, "learning_rate": 1.8973936491032677e-05, "loss": 0.0007, "num_tokens": 582253873.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2312878723222668, "frac_reward_zero_std": 1.0, "grad_norm": 1.1425897000380666e-10, "kl": 0.0177001953125, "learning_rate": 1.8971306096068175e-05, "loss": 0.0007, "num_tokens": 582816049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23145856447896218, "frac_reward_zero_std": 1.0, "grad_norm": 1.1520769573548253e-10, "kl": 0.017730712890625, "learning_rate": 1.896867251661385e-05, "loss": 0.0007, "num_tokens": 583379713.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2316292566356576, "frac_reward_zero_std": 1.0, "grad_norm": 1.210392137473539e-10, "kl": 0.017578125, "learning_rate": 1.8966035753604536e-05, "loss": 0.0007, "num_tokens": 583946897.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.231799948792353, "frac_reward_zero_std": 1.0, "grad_norm": 1.2568042731937148e-10, "kl": 0.017822265625, "learning_rate": 1.896339580797618e-05, "loss": 0.0007, "num_tokens": 584512673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2319706409490484, "frac_reward_zero_std": 1.0, "grad_norm": 1.2624816504566903e-10, "kl": 0.01690673828125, "learning_rate": 1.8960752680665876e-05, "loss": 0.0007, "num_tokens": 585079889.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23214133310574378, "frac_reward_zero_std": 1.0, "grad_norm": 1.3426911957986166e-10, "kl": 0.01690673828125, "learning_rate": 1.895810637261183e-05, "loss": 0.0007, "num_tokens": 585652305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2323120252624392, "frac_reward_zero_std": 1.0, "grad_norm": 1.3019234023714027e-10, "kl": 0.017669677734375, "learning_rate": 1.8955456884753396e-05, "loss": 0.0007, "num_tokens": 586216113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2324827174191346, "frac_reward_zero_std": 1.0, "grad_norm": 1.2143689819051433e-10, "kl": 0.0174560546875, "learning_rate": 1.8952804218031038e-05, "loss": 0.0007, "num_tokens": 586780401.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23265340957583, "frac_reward_zero_std": 1.0, "grad_norm": 1.2024630487598423e-10, "kl": 0.017181396484375, "learning_rate": 1.8950148373386365e-05, "loss": 0.0007, "num_tokens": 587347441.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23282410173252538, "frac_reward_zero_std": 1.0, "grad_norm": 1.2155984349595344e-10, "kl": 0.017669677734375, "learning_rate": 1.8947489351762094e-05, "loss": 0.0007, "num_tokens": 587910401.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23299479388922079, "frac_reward_zero_std": 1.0, "grad_norm": 1.1317299018385331e-10, "kl": 0.016845703125, "learning_rate": 1.8944827154102096e-05, "loss": 0.0007, "num_tokens": 588474321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2331654860459162, "frac_reward_zero_std": 1.0, "grad_norm": 1.2446939608812862e-10, "kl": 0.017486572265625, "learning_rate": 1.8942161781351346e-05, "loss": 0.0007, "num_tokens": 589049361.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2333361782026116, "frac_reward_zero_std": 1.0, "grad_norm": 1.2930971450568662e-10, "kl": 0.0174560546875, "learning_rate": 1.8939493234455953e-05, "loss": 0.0007, "num_tokens": 589617489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23350687035930698, "frac_reward_zero_std": 1.0, "grad_norm": 1.0973996601782152e-10, "kl": 0.017730712890625, "learning_rate": 1.893682151436316e-05, "loss": 0.0007, "num_tokens": 590177793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23367756251600239, "frac_reward_zero_std": 1.0, "grad_norm": 1.1623381808864348e-10, "kl": 0.01763916015625, "learning_rate": 1.8934146622021336e-05, "loss": 0.0007, "num_tokens": 590745121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2338482546726978, "frac_reward_zero_std": 1.0, "grad_norm": 1.2152465387135984e-10, "kl": 0.017791748046875, "learning_rate": 1.893146855837996e-05, "loss": 0.0007, "num_tokens": 591311121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2340189468293932, "frac_reward_zero_std": 1.0, "grad_norm": 1.255018680153965e-10, "kl": 0.017242431640625, "learning_rate": 1.8928787324389656e-05, "loss": 0.0007, "num_tokens": 591876753.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23418963898608858, "frac_reward_zero_std": 1.0, "grad_norm": 1.182105801885832e-10, "kl": 0.017791748046875, "learning_rate": 1.892610292100216e-05, "loss": 0.0007, "num_tokens": 592441585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23436033114278398, "frac_reward_zero_std": 1.0, "grad_norm": 1.1590433201431263e-10, "kl": 0.01708984375, "learning_rate": 1.8923415349170346e-05, "loss": 0.0007, "num_tokens": 593009729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2345310232994794, "frac_reward_zero_std": 1.0, "grad_norm": 1.1840384544612228e-10, "kl": 0.016876220703125, "learning_rate": 1.8920724609848193e-05, "loss": 0.0007, "num_tokens": 593575169.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2347017154561748, "frac_reward_zero_std": 1.0, "grad_norm": 1.2850261438753765e-10, "kl": 0.016845703125, "learning_rate": 1.8918030703990827e-05, "loss": 0.0007, "num_tokens": 594165473.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23487240761287018, "frac_reward_zero_std": 1.0, "grad_norm": 1.238609398869575e-10, "kl": 0.016937255859375, "learning_rate": 1.8915333632554483e-05, "loss": 0.0007, "num_tokens": 594732881.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23504309976956558, "frac_reward_zero_std": 1.0, "grad_norm": 1.1723481621640436e-10, "kl": 0.0167236328125, "learning_rate": 1.8912633396496522e-05, "loss": 0.0007, "num_tokens": 595298993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.235213791926261, "frac_reward_zero_std": 1.0, "grad_norm": 1.2807433130018243e-10, "kl": 0.016937255859375, "learning_rate": 1.8909929996775436e-05, "loss": 0.0007, "num_tokens": 595867073.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2353844840829564, "frac_reward_zero_std": 1.0, "grad_norm": 1.2551884675830696e-10, "kl": 0.017669677734375, "learning_rate": 1.890722343435083e-05, "loss": 0.0007, "num_tokens": 596434833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23555517623965178, "frac_reward_zero_std": 1.0, "grad_norm": 1.1074082185211884e-10, "kl": 0.01715087890625, "learning_rate": 1.890451371018344e-05, "loss": 0.0007, "num_tokens": 597000577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23572586839634718, "frac_reward_zero_std": 1.0, "grad_norm": 1.2058394805419076e-10, "kl": 0.017333984375, "learning_rate": 1.8901800825235113e-05, "loss": 0.0007, "num_tokens": 597565137.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2358965605530426, "frac_reward_zero_std": 1.0, "grad_norm": 1.3114253842551551e-10, "kl": 0.017120361328125, "learning_rate": 1.889908478046883e-05, "loss": 0.0007, "num_tokens": 598131377.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.236067252709738, "frac_reward_zero_std": 1.0, "grad_norm": 1.2513858058078517e-10, "kl": 0.0179443359375, "learning_rate": 1.889636557684869e-05, "loss": 0.0007, "num_tokens": 598700209.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23623794486643337, "frac_reward_zero_std": 1.0, "grad_norm": 1.1993263087230383e-10, "kl": 0.017181396484375, "learning_rate": 1.889364321533991e-05, "loss": 0.0007, "num_tokens": 599274577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23640863702312878, "frac_reward_zero_std": 1.0, "grad_norm": 1.089870297897644e-10, "kl": 0.01776123046875, "learning_rate": 1.889091769690883e-05, "loss": 0.0007, "num_tokens": 599835873.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2365793291798242, "frac_reward_zero_std": 1.0, "grad_norm": 1.2281867328411056e-10, "kl": 0.0172119140625, "learning_rate": 1.8888189022522914e-05, "loss": 0.0007, "num_tokens": 600398465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2367500213365196, "frac_reward_zero_std": 1.0, "grad_norm": 1.378826403798527e-10, "kl": 0.017242431640625, "learning_rate": 1.888545719315074e-05, "loss": 0.0007, "num_tokens": 600965233.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23692071349321497, "frac_reward_zero_std": 1.0, "grad_norm": 1.1968188787005427e-10, "kl": 0.01824951171875, "learning_rate": 1.8882722209762002e-05, "loss": 0.0007, "num_tokens": 601525185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23709140564991038, "frac_reward_zero_std": 1.0, "grad_norm": 1.2503565010163723e-10, "kl": 0.018585205078125, "learning_rate": 1.8879984073327536e-05, "loss": 0.0007, "num_tokens": 602086449.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2372620978066058, "frac_reward_zero_std": 1.0, "grad_norm": 1.2388132937519357e-10, "kl": 0.01702880859375, "learning_rate": 1.887724278481927e-05, "loss": 0.0007, "num_tokens": 602652993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2374327899633012, "frac_reward_zero_std": 1.0, "grad_norm": 1.3161031150362403e-10, "kl": 0.016876220703125, "learning_rate": 1.8874498345210265e-05, "loss": 0.0007, "num_tokens": 603220049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23760348211999657, "frac_reward_zero_std": 1.0, "grad_norm": 1.246024680278966e-10, "kl": 0.017730712890625, "learning_rate": 1.8871750755474697e-05, "loss": 0.0007, "num_tokens": 603786641.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23777417427669198, "frac_reward_zero_std": 1.0, "grad_norm": 1.1701669043568292e-10, "kl": 0.0177001953125, "learning_rate": 1.8869000016587865e-05, "loss": 0.0007, "num_tokens": 604351041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2379448664333874, "frac_reward_zero_std": 1.0, "grad_norm": 1.3758422094599992e-10, "kl": 0.01776123046875, "learning_rate": 1.886624612952618e-05, "loss": 0.0007, "num_tokens": 604916993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2381155585900828, "frac_reward_zero_std": 1.0, "grad_norm": 1.2182661931070437e-10, "kl": 0.01837158203125, "learning_rate": 1.886348909526717e-05, "loss": 0.0007, "num_tokens": 605478145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23828625074677817, "frac_reward_zero_std": 1.0, "grad_norm": 1.2715882244215074e-10, "kl": 0.017486572265625, "learning_rate": 1.8860728914789485e-05, "loss": 0.0007, "num_tokens": 606045569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23845694290347358, "frac_reward_zero_std": 1.0, "grad_norm": 1.2453425031035607e-10, "kl": 0.01690673828125, "learning_rate": 1.885796558907289e-05, "loss": 0.0007, "num_tokens": 606610529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23862763506016899, "frac_reward_zero_std": 1.0, "grad_norm": 1.1538656124248877e-10, "kl": 0.01788330078125, "learning_rate": 1.8855199119098266e-05, "loss": 0.0007, "num_tokens": 607171937.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2387983272168644, "frac_reward_zero_std": 1.0, "grad_norm": 1.2269315136433564e-10, "kl": 0.01751708984375, "learning_rate": 1.8852429505847607e-05, "loss": 0.0007, "num_tokens": 607734625.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23896901937355977, "frac_reward_zero_std": 1.0, "grad_norm": 1.2068417626563592e-10, "kl": 0.0177001953125, "learning_rate": 1.884965675030403e-05, "loss": 0.0007, "num_tokens": 608296785.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23913971153025518, "frac_reward_zero_std": 1.0, "grad_norm": 1.1385992261116272e-10, "kl": 0.01763916015625, "learning_rate": 1.8846880853451762e-05, "loss": 0.0007, "num_tokens": 608856673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23931040368695058, "frac_reward_zero_std": 1.0, "grad_norm": 1.3786972411472846e-10, "kl": 0.0166015625, "learning_rate": 1.884410181627614e-05, "loss": 0.0007, "num_tokens": 609430017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.239481095843646, "frac_reward_zero_std": 1.0, "grad_norm": 1.3162233806316992e-10, "kl": 0.017791748046875, "learning_rate": 1.884131963976363e-05, "loss": 0.0007, "num_tokens": 609996145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2396517880003414, "frac_reward_zero_std": 1.0, "grad_norm": 1.1273236925756705e-10, "kl": 0.017730712890625, "learning_rate": 1.88385343249018e-05, "loss": 0.0007, "num_tokens": 610567137.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23982248015703678, "frac_reward_zero_std": 1.0, "grad_norm": 1.3471337819709622e-10, "kl": 0.01776123046875, "learning_rate": 1.883574587267934e-05, "loss": 0.0007, "num_tokens": 611129601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23999317231373218, "frac_reward_zero_std": 1.0, "grad_norm": 1.1969553129261948e-10, "kl": 0.017242431640625, "learning_rate": 1.8832954284086046e-05, "loss": 0.0007, "num_tokens": 611690081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2401638644704276, "frac_reward_zero_std": 1.0, "grad_norm": 1.1586145508798261e-10, "kl": 0.016845703125, "learning_rate": 1.8830159560112835e-05, "loss": 0.0007, "num_tokens": 612256321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.240334556627123, "frac_reward_zero_std": 1.0, "grad_norm": 1.1853721141747036e-10, "kl": 0.01788330078125, "learning_rate": 1.8827361701751725e-05, "loss": 0.0007, "num_tokens": 612819969.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24050524878381838, "frac_reward_zero_std": 1.0, "grad_norm": 1.2652933329305486e-10, "kl": 0.01739501953125, "learning_rate": 1.8824560709995864e-05, "loss": 0.0007, "num_tokens": 613381857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24067594094051378, "frac_reward_zero_std": 1.0, "grad_norm": 1.2653665116270893e-10, "kl": 0.017730712890625, "learning_rate": 1.88217565858395e-05, "loss": 0.0007, "num_tokens": 613951073.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2408466330972092, "frac_reward_zero_std": 1.0, "grad_norm": 1.3269073747669621e-10, "kl": 0.01751708984375, "learning_rate": 1.881894933027799e-05, "loss": 0.0007, "num_tokens": 614519489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2410173252539046, "frac_reward_zero_std": 1.0, "grad_norm": 1.180460296350334e-10, "kl": 0.017242431640625, "learning_rate": 1.881613894430782e-05, "loss": 0.0007, "num_tokens": 615086161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24118801741059998, "frac_reward_zero_std": 1.0, "grad_norm": 1.2186142259138643e-10, "kl": 0.017486572265625, "learning_rate": 1.8813325428926565e-05, "loss": 0.0007, "num_tokens": 615653249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24135870956729538, "frac_reward_zero_std": 1.0, "grad_norm": 1.2150792835443807e-10, "kl": 0.01715087890625, "learning_rate": 1.8810508785132925e-05, "loss": 0.0007, "num_tokens": 616222705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2415294017239908, "frac_reward_zero_std": 1.0, "grad_norm": 1.246740879496677e-10, "kl": 0.016693115234375, "learning_rate": 1.8807689013926712e-05, "loss": 0.0007, "num_tokens": 616790945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2417000938806862, "frac_reward_zero_std": 1.0, "grad_norm": 1.288202298822152e-10, "kl": 0.017181396484375, "learning_rate": 1.8804866116308835e-05, "loss": 0.0007, "num_tokens": 617358097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24187078603738157, "frac_reward_zero_std": 1.0, "grad_norm": 1.2085716219171763e-10, "kl": 0.016754150390625, "learning_rate": 1.8802040093281323e-05, "loss": 0.0007, "num_tokens": 617920609.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24204147819407698, "frac_reward_zero_std": 1.0, "grad_norm": 1.0957917279302298e-10, "kl": 0.0174560546875, "learning_rate": 1.8799210945847318e-05, "loss": 0.0007, "num_tokens": 618481537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2422121703507724, "frac_reward_zero_std": 1.0, "grad_norm": 1.2557262176840816e-10, "kl": 0.01776123046875, "learning_rate": 1.8796378675011057e-05, "loss": 0.0007, "num_tokens": 619044737.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2423828625074678, "frac_reward_zero_std": 1.0, "grad_norm": 1.377333519099263e-10, "kl": 0.01776123046875, "learning_rate": 1.8793543281777897e-05, "loss": 0.0007, "num_tokens": 619606929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24255355466416317, "frac_reward_zero_std": 1.0, "grad_norm": 1.2865837024808315e-10, "kl": 0.017852783203125, "learning_rate": 1.8790704767154302e-05, "loss": 0.0007, "num_tokens": 620171649.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24272424682085858, "frac_reward_zero_std": 1.0, "grad_norm": 1.2986129077685814e-10, "kl": 0.017364501953125, "learning_rate": 1.878786313214784e-05, "loss": 0.0007, "num_tokens": 620737697.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.242894938977554, "frac_reward_zero_std": 1.0, "grad_norm": 1.1210874364973585e-10, "kl": 0.017547607421875, "learning_rate": 1.878501837776719e-05, "loss": 0.0007, "num_tokens": 621303537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2430656311342494, "frac_reward_zero_std": 1.0, "grad_norm": 1.1764007675718773e-10, "kl": 0.017608642578125, "learning_rate": 1.8782170505022137e-05, "loss": 0.0007, "num_tokens": 621867457.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24323632329094477, "frac_reward_zero_std": 1.0, "grad_norm": 1.1022307332738848e-10, "kl": 0.01751708984375, "learning_rate": 1.8779319514923574e-05, "loss": 0.0007, "num_tokens": 622431841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24340701544764018, "frac_reward_zero_std": 1.0, "grad_norm": 1.1716887324360403e-10, "kl": 0.017181396484375, "learning_rate": 1.87764654084835e-05, "loss": 0.0007, "num_tokens": 622996129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24357770760433559, "frac_reward_zero_std": 1.0, "grad_norm": 1.3098310420087136e-10, "kl": 0.0174560546875, "learning_rate": 1.877360818671501e-05, "loss": 0.0007, "num_tokens": 623564241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.243748399761031, "frac_reward_zero_std": 1.0, "grad_norm": 1.368225321513821e-10, "kl": 0.017578125, "learning_rate": 1.877074785063233e-05, "loss": 0.0007, "num_tokens": 624127393.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24391909191772637, "frac_reward_zero_std": 1.0, "grad_norm": 1.2006780194287353e-10, "kl": 0.0172119140625, "learning_rate": 1.8767884401250766e-05, "loss": 0.0007, "num_tokens": 624689857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24408978407442178, "frac_reward_zero_std": 1.0, "grad_norm": 1.131049790540192e-10, "kl": 0.017425537109375, "learning_rate": 1.8765017839586742e-05, "loss": 0.0007, "num_tokens": 625252049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24426047623111719, "frac_reward_zero_std": 1.0, "grad_norm": 1.442486819712523e-10, "kl": 0.018402099609375, "learning_rate": 1.8762148166657787e-05, "loss": 0.0007, "num_tokens": 625820433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2444311683878126, "frac_reward_zero_std": 1.0, "grad_norm": 1.2082973074988472e-10, "kl": 0.017547607421875, "learning_rate": 1.8759275383482525e-05, "loss": 0.0007, "num_tokens": 626384321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24460186054450797, "frac_reward_zero_std": 1.0, "grad_norm": 1.330198486920272e-10, "kl": 0.0172119140625, "learning_rate": 1.8756399491080696e-05, "loss": 0.0007, "num_tokens": 626946273.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24477255270120338, "frac_reward_zero_std": 1.0, "grad_norm": 1.2024599785370673e-10, "kl": 0.0172119140625, "learning_rate": 1.8753520490473134e-05, "loss": 0.0007, "num_tokens": 627512369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24494324485789878, "frac_reward_zero_std": 1.0, "grad_norm": 1.1522596301743322e-10, "kl": 0.016998291015625, "learning_rate": 1.875063838268178e-05, "loss": 0.0007, "num_tokens": 628076433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2451139370145942, "frac_reward_zero_std": 1.0, "grad_norm": 1.2493058583826253e-10, "kl": 0.017578125, "learning_rate": 1.8747753168729687e-05, "loss": 0.0007, "num_tokens": 628641345.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24528462917128957, "frac_reward_zero_std": 1.0, "grad_norm": 1.2476811925130825e-10, "kl": 0.01690673828125, "learning_rate": 1.8744864849640988e-05, "loss": 0.0007, "num_tokens": 629208081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24545532132798498, "frac_reward_zero_std": 1.0, "grad_norm": 1.2214594006742134e-10, "kl": 0.017578125, "learning_rate": 1.8741973426440944e-05, "loss": 0.0007, "num_tokens": 629772241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24562601348468038, "frac_reward_zero_std": 1.0, "grad_norm": 1.202283341027257e-10, "kl": 0.018218994140625, "learning_rate": 1.8739078900155894e-05, "loss": 0.0007, "num_tokens": 630338449.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2457967056413758, "frac_reward_zero_std": 1.0, "grad_norm": 1.1247448786725767e-10, "kl": 0.017669677734375, "learning_rate": 1.87361812718133e-05, "loss": 0.0007, "num_tokens": 630896401.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24596739779807117, "frac_reward_zero_std": 1.0, "grad_norm": 1.186220753419818e-10, "kl": 0.0184326171875, "learning_rate": 1.8733280542441713e-05, "loss": 0.0007, "num_tokens": 631462353.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24613808995476658, "frac_reward_zero_std": 1.0, "grad_norm": 1.1423080562566036e-10, "kl": 0.017974853515625, "learning_rate": 1.873037671307079e-05, "loss": 0.0007, "num_tokens": 632021281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24630878211146198, "frac_reward_zero_std": 1.0, "grad_norm": 1.3269048267474916e-10, "kl": 0.017425537109375, "learning_rate": 1.8727469784731277e-05, "loss": 0.0007, "num_tokens": 632586913.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2464794742681574, "frac_reward_zero_std": 1.0, "grad_norm": 1.1030817517424569e-10, "kl": 0.017181396484375, "learning_rate": 1.8724559758455037e-05, "loss": 0.0007, "num_tokens": 633142481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24665016642485277, "frac_reward_zero_std": 1.0, "grad_norm": 1.296432709909829e-10, "kl": 0.016937255859375, "learning_rate": 1.8721646635275022e-05, "loss": 0.0007, "num_tokens": 633706945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24682085858154817, "frac_reward_zero_std": 1.0, "grad_norm": 1.2804334789290953e-10, "kl": 0.017120361328125, "learning_rate": 1.871873041622528e-05, "loss": 0.0007, "num_tokens": 634270433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24699155073824358, "frac_reward_zero_std": 1.0, "grad_norm": 1.3167781163397613e-10, "kl": 0.017730712890625, "learning_rate": 1.8715811102340975e-05, "loss": 0.0007, "num_tokens": 634842049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.247162242894939, "frac_reward_zero_std": 1.0, "grad_norm": 1.1371275769656544e-10, "kl": 0.017913818359375, "learning_rate": 1.871288869465835e-05, "loss": 0.0007, "num_tokens": 635406065.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24733293505163437, "frac_reward_zero_std": 1.0, "grad_norm": 1.0807111195737444e-10, "kl": 0.017547607421875, "learning_rate": 1.8709963194214752e-05, "loss": 0.0007, "num_tokens": 635966225.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24750362720832977, "frac_reward_zero_std": 1.0, "grad_norm": 1.1883033369282608e-10, "kl": 0.017578125, "learning_rate": 1.8707034602048636e-05, "loss": 0.0007, "num_tokens": 636536289.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24767431936502518, "frac_reward_zero_std": 1.0, "grad_norm": 1.346654433012128e-10, "kl": 0.017181396484375, "learning_rate": 1.870410291919954e-05, "loss": 0.0007, "num_tokens": 637109329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2478450115217206, "frac_reward_zero_std": 1.0, "grad_norm": 1.267259587185936e-10, "kl": 0.0181884765625, "learning_rate": 1.8701168146708104e-05, "loss": 0.0007, "num_tokens": 637672801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24801570367841597, "frac_reward_zero_std": 1.0, "grad_norm": 1.303817865909875e-10, "kl": 0.01751708984375, "learning_rate": 1.8698230285616073e-05, "loss": 0.0007, "num_tokens": 638257921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24818639583511137, "frac_reward_zero_std": 1.0, "grad_norm": 1.0984506312315775e-10, "kl": 0.017242431640625, "learning_rate": 1.869528933696628e-05, "loss": 0.0007, "num_tokens": 638821873.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24835708799180678, "frac_reward_zero_std": 1.0, "grad_norm": 1.3720783088199765e-10, "kl": 0.018463134765625, "learning_rate": 1.8692345301802654e-05, "loss": 0.0007, "num_tokens": 639384193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2485277801485022, "frac_reward_zero_std": 1.0, "grad_norm": 1.3539202679686088e-10, "kl": 0.017059326171875, "learning_rate": 1.868939818117022e-05, "loss": 0.0007, "num_tokens": 639953249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24869847230519757, "frac_reward_zero_std": 1.0, "grad_norm": 1.3624859057830874e-10, "kl": 0.016998291015625, "learning_rate": 1.8686447976115103e-05, "loss": 0.0007, "num_tokens": 640521473.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24886916446189297, "frac_reward_zero_std": 1.0, "grad_norm": 1.2223121633908695e-10, "kl": 0.017242431640625, "learning_rate": 1.8683494687684517e-05, "loss": 0.0007, "num_tokens": 641082977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24903985661858838, "frac_reward_zero_std": 1.0, "grad_norm": 1.2871874879538237e-10, "kl": 0.017303466796875, "learning_rate": 1.868053831692677e-05, "loss": 0.0007, "num_tokens": 641645857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24921054877528379, "frac_reward_zero_std": 1.0, "grad_norm": 1.1502625464374575e-10, "kl": 0.01727294921875, "learning_rate": 1.8677578864891274e-05, "loss": 0.0007, "num_tokens": 642209169.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24938124093197916, "frac_reward_zero_std": 1.0, "grad_norm": 1.279760096961811e-10, "kl": 0.0174560546875, "learning_rate": 1.867461633262852e-05, "loss": 0.0007, "num_tokens": 642782465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24955193308867457, "frac_reward_zero_std": 1.0, "grad_norm": 1.1027304262436832e-10, "kl": 0.01727294921875, "learning_rate": 1.8671650721190105e-05, "loss": 0.0007, "num_tokens": 643342113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24972262524536998, "frac_reward_zero_std": 1.0, "grad_norm": 1.1174365817086527e-10, "kl": 0.018035888671875, "learning_rate": 1.8668682031628712e-05, "loss": 0.0007, "num_tokens": 643904929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24989331740206538, "frac_reward_zero_std": 1.0, "grad_norm": 1.2116372990590254e-10, "kl": 0.0172119140625, "learning_rate": 1.8665710264998124e-05, "loss": 0.0007, "num_tokens": 644468417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2500640095587608, "frac_reward_zero_std": 1.0, "grad_norm": 1.1137820752043579e-10, "kl": 0.0177001953125, "learning_rate": 1.86627354223532e-05, "loss": 0.0007, "num_tokens": 645037073.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2502347017154562, "frac_reward_zero_std": 1.0, "grad_norm": 1.486192435109716e-10, "kl": 0.017730712890625, "learning_rate": 1.8659757504749908e-05, "loss": 0.0007, "num_tokens": 645607969.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25040539387215155, "frac_reward_zero_std": 1.0, "grad_norm": 1.3313006978000417e-10, "kl": 0.01824951171875, "learning_rate": 1.8656776513245308e-05, "loss": 0.0007, "num_tokens": 646176945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25057608602884696, "frac_reward_zero_std": 1.0, "grad_norm": 1.217572904273556e-10, "kl": 0.0174560546875, "learning_rate": 1.865379244889753e-05, "loss": 0.0007, "num_tokens": 646744145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25074677818554236, "frac_reward_zero_std": 1.0, "grad_norm": 1.2600817170948762e-10, "kl": 0.01812744140625, "learning_rate": 1.865080531276582e-05, "loss": 0.0007, "num_tokens": 647307729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25091747034223777, "frac_reward_zero_std": 1.0, "grad_norm": 1.3042010958294065e-10, "kl": 0.016754150390625, "learning_rate": 1.8647815105910498e-05, "loss": 0.0007, "num_tokens": 647876465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2510881624989332, "frac_reward_zero_std": 1.0, "grad_norm": 1.3094161017813978e-10, "kl": 0.01739501953125, "learning_rate": 1.8644821829392984e-05, "loss": 0.0007, "num_tokens": 648445937.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2512588546556286, "frac_reward_zero_std": 1.0, "grad_norm": 1.234218557873776e-10, "kl": 0.0177001953125, "learning_rate": 1.864182548427578e-05, "loss": 0.0007, "num_tokens": 649008801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.251429546812324, "frac_reward_zero_std": 1.0, "grad_norm": 1.2151292090680379e-10, "kl": 0.018096923828125, "learning_rate": 1.863882607162248e-05, "loss": 0.0007, "num_tokens": 649570833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2516002389690194, "frac_reward_zero_std": 1.0, "grad_norm": 1.2044074915350886e-10, "kl": 0.01739501953125, "learning_rate": 1.863582359249777e-05, "loss": 0.0007, "num_tokens": 650133057.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25177093112571475, "frac_reward_zero_std": 1.0, "grad_norm": 1.1603409304952092e-10, "kl": 0.0174560546875, "learning_rate": 1.863281804796742e-05, "loss": 0.0007, "num_tokens": 650693361.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25194162328241015, "frac_reward_zero_std": 1.0, "grad_norm": 1.1982262325058685e-10, "kl": 0.017242431640625, "learning_rate": 1.862980943909829e-05, "loss": 0.0007, "num_tokens": 651260193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25211231543910556, "frac_reward_zero_std": 1.0, "grad_norm": 1.424147659150571e-10, "kl": 0.01763916015625, "learning_rate": 1.8626797766958323e-05, "loss": 0.0007, "num_tokens": 651827745.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25228300759580097, "frac_reward_zero_std": 1.0, "grad_norm": 1.2532149194918676e-10, "kl": 0.017333984375, "learning_rate": 1.8623783032616562e-05, "loss": 0.0007, "num_tokens": 652389441.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2524536997524964, "frac_reward_zero_std": 1.0, "grad_norm": 1.3300199640733193e-10, "kl": 0.017578125, "learning_rate": 1.8620765237143127e-05, "loss": 0.0007, "num_tokens": 652957905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2526243919091918, "frac_reward_zero_std": 1.0, "grad_norm": 1.3214825748531695e-10, "kl": 0.017730712890625, "learning_rate": 1.861774438160922e-05, "loss": 0.0007, "num_tokens": 653521105.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2527950840658872, "frac_reward_zero_std": 1.0, "grad_norm": 1.315753881795293e-10, "kl": 0.01788330078125, "learning_rate": 1.861472046708714e-05, "loss": 0.0007, "num_tokens": 654084609.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2529657762225826, "frac_reward_zero_std": 1.0, "grad_norm": 1.1925826813627107e-10, "kl": 0.0172119140625, "learning_rate": 1.861169349465027e-05, "loss": 0.0007, "num_tokens": 654650689.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25313646837927795, "frac_reward_zero_std": 1.0, "grad_norm": 1.314543952044657e-10, "kl": 0.017578125, "learning_rate": 1.8608663465373077e-05, "loss": 0.0007, "num_tokens": 655214177.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25330716053597335, "frac_reward_zero_std": 1.0, "grad_norm": 1.2909385389034284e-10, "kl": 0.0172119140625, "learning_rate": 1.8605630380331104e-05, "loss": 0.0007, "num_tokens": 655789777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25347785269266876, "frac_reward_zero_std": 1.0, "grad_norm": 1.1611844662079748e-10, "kl": 0.01708984375, "learning_rate": 1.8602594240600993e-05, "loss": 0.0007, "num_tokens": 656353441.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25364854484936417, "frac_reward_zero_std": 1.0, "grad_norm": 1.33699222984807e-10, "kl": 0.01739501953125, "learning_rate": 1.859955504726046e-05, "loss": 0.0007, "num_tokens": 656922625.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2538192370060596, "frac_reward_zero_std": 1.0, "grad_norm": 1.17868334029583e-10, "kl": 0.0174560546875, "learning_rate": 1.8596512801388317e-05, "loss": 0.0007, "num_tokens": 657487201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.253989929162755, "frac_reward_zero_std": 1.0, "grad_norm": 1.1888823337164757e-10, "kl": 0.017669677734375, "learning_rate": 1.8593467504064442e-05, "loss": 0.0007, "num_tokens": 658049825.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2541606213194504, "frac_reward_zero_std": 1.0, "grad_norm": 1.3303843339265086e-10, "kl": 0.01715087890625, "learning_rate": 1.859041915636981e-05, "loss": 0.0007, "num_tokens": 658617201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2543313134761458, "frac_reward_zero_std": 1.0, "grad_norm": 1.2122478840189562e-10, "kl": 0.017486572265625, "learning_rate": 1.858736775938647e-05, "loss": 0.0007, "num_tokens": 659179233.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25450200563284114, "frac_reward_zero_std": 1.0, "grad_norm": 1.2606377078871056e-10, "kl": 0.017333984375, "learning_rate": 1.8584313314197568e-05, "loss": 0.0007, "num_tokens": 659745617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25467269778953655, "frac_reward_zero_std": 1.0, "grad_norm": 1.3964702490609816e-10, "kl": 0.017486572265625, "learning_rate": 1.8581255821887312e-05, "loss": 0.0007, "num_tokens": 660310881.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25484338994623196, "frac_reward_zero_std": 1.0, "grad_norm": 1.2796543385696852e-10, "kl": 0.018585205078125, "learning_rate": 1.8578195283541004e-05, "loss": 0.0007, "num_tokens": 660873473.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25501408210292736, "frac_reward_zero_std": 1.0, "grad_norm": 1.3480115425122349e-10, "kl": 0.017364501953125, "learning_rate": 1.8575131700245027e-05, "loss": 0.0007, "num_tokens": 661446817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25518477425962277, "frac_reward_zero_std": 1.0, "grad_norm": 1.2146673990212683e-10, "kl": 0.017425537109375, "learning_rate": 1.8572065073086843e-05, "loss": 0.0007, "num_tokens": 662009569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2553554664163182, "frac_reward_zero_std": 1.0, "grad_norm": 1.2154670429368184e-10, "kl": 0.017059326171875, "learning_rate": 1.8568995403154992e-05, "loss": 0.0007, "num_tokens": 662574801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2555261585730136, "frac_reward_zero_std": 1.0, "grad_norm": 1.1753318111526224e-10, "kl": 0.017608642578125, "learning_rate": 1.8565922691539092e-05, "loss": 0.0007, "num_tokens": 663142785.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.255696850729709, "frac_reward_zero_std": 1.0, "grad_norm": 1.2431757829094333e-10, "kl": 0.0174560546875, "learning_rate": 1.856284693932985e-05, "loss": 0.0007, "num_tokens": 663714817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25586754288640434, "frac_reward_zero_std": 1.0, "grad_norm": 1.2135721144398085e-10, "kl": 0.017791748046875, "learning_rate": 1.8559768147619043e-05, "loss": 0.0007, "num_tokens": 664281377.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25603823504309975, "frac_reward_zero_std": 1.0, "grad_norm": 2.616107172726211e-10, "kl": 0.016845703125, "learning_rate": 1.8556686317499536e-05, "loss": 0.0007, "num_tokens": 664844337.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25620892719979516, "frac_reward_zero_std": 1.0, "grad_norm": 1.2626217821864497e-10, "kl": 0.017120361328125, "learning_rate": 1.8553601450065265e-05, "loss": 0.0007, "num_tokens": 665411505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25637961935649056, "frac_reward_zero_std": 1.0, "grad_norm": 1.2300613916936083e-10, "kl": 0.017486572265625, "learning_rate": 1.8550513546411248e-05, "loss": 0.0007, "num_tokens": 665977953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25655031151318597, "frac_reward_zero_std": 1.0, "grad_norm": 1.225409879569374e-10, "kl": 0.0166015625, "learning_rate": 1.8547422607633574e-05, "loss": 0.0007, "num_tokens": 666546657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2567210036698814, "frac_reward_zero_std": 1.0, "grad_norm": 1.2692029730030608e-10, "kl": 0.017486572265625, "learning_rate": 1.8544328634829423e-05, "loss": 0.0007, "num_tokens": 667112209.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2568916958265768, "frac_reward_zero_std": 1.0, "grad_norm": 1.3127272584980772e-10, "kl": 0.0177001953125, "learning_rate": 1.8541231629097038e-05, "loss": 0.0007, "num_tokens": 667674065.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2570623879832722, "frac_reward_zero_std": 1.0, "grad_norm": 1.4096858482440965e-10, "kl": 0.016876220703125, "learning_rate": 1.853813159153574e-05, "loss": 0.0007, "num_tokens": 668243537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2572330801399676, "frac_reward_zero_std": 1.0, "grad_norm": 1.2576031692249394e-10, "kl": 0.016937255859375, "learning_rate": 1.853502852324594e-05, "loss": 0.0007, "num_tokens": 668810065.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25740377229666295, "frac_reward_zero_std": 1.0, "grad_norm": 1.4103726733707074e-10, "kl": 0.017547607421875, "learning_rate": 1.8531922425329112e-05, "loss": 0.0007, "num_tokens": 669378081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25757446445335835, "frac_reward_zero_std": 1.0, "grad_norm": 1.2585764958318296e-10, "kl": 0.017578125, "learning_rate": 1.852881329888781e-05, "loss": 0.0007, "num_tokens": 669946193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25774515661005376, "frac_reward_zero_std": 1.0, "grad_norm": 1.3313847755854922e-10, "kl": 0.017333984375, "learning_rate": 1.8525701145025655e-05, "loss": 0.0007, "num_tokens": 670505713.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25791584876674917, "frac_reward_zero_std": 1.0, "grad_norm": 1.3295083653191558e-10, "kl": 0.01702880859375, "learning_rate": 1.8522585964847356e-05, "loss": 0.0007, "num_tokens": 671069089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2580865409234446, "frac_reward_zero_std": 1.0, "grad_norm": 1.2254951742010767e-10, "kl": 0.0172119140625, "learning_rate": 1.8519467759458686e-05, "loss": 0.0007, "num_tokens": 671635969.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25825723308014, "frac_reward_zero_std": 1.0, "grad_norm": 1.1801607505666356e-10, "kl": 0.016998291015625, "learning_rate": 1.8516346529966496e-05, "loss": 0.0007, "num_tokens": 672200929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2584279252368354, "frac_reward_zero_std": 1.0, "grad_norm": 1.2056738589494352e-10, "kl": 0.01708984375, "learning_rate": 1.851322227747871e-05, "loss": 0.0007, "num_tokens": 672768785.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2585986173935308, "frac_reward_zero_std": 1.0, "grad_norm": 1.2810133312265878e-10, "kl": 0.01715087890625, "learning_rate": 1.8510095003104325e-05, "loss": 0.0007, "num_tokens": 673338497.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25876930955022615, "frac_reward_zero_std": 1.0, "grad_norm": 1.2840670069830707e-10, "kl": 0.017578125, "learning_rate": 1.8506964707953413e-05, "loss": 0.0007, "num_tokens": 673902609.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25894000170692155, "frac_reward_zero_std": 1.0, "grad_norm": 1.160498201910253e-10, "kl": 0.01715087890625, "learning_rate": 1.8503831393137114e-05, "loss": 0.0007, "num_tokens": 674464673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25911069386361696, "frac_reward_zero_std": 1.0, "grad_norm": 1.2053880057488787e-10, "kl": 0.017333984375, "learning_rate": 1.8500695059767638e-05, "loss": 0.0007, "num_tokens": 675029601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25928138602031237, "frac_reward_zero_std": 1.0, "grad_norm": 1.2513080848771418e-10, "kl": 0.01763916015625, "learning_rate": 1.8497555708958273e-05, "loss": 0.0007, "num_tokens": 675595937.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25945207817700777, "frac_reward_zero_std": 1.0, "grad_norm": 1.2719805744628945e-10, "kl": 0.01727294921875, "learning_rate": 1.849441334182338e-05, "loss": 0.0007, "num_tokens": 676164209.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2596227703337032, "frac_reward_zero_std": 1.0, "grad_norm": 1.1008485565664191e-10, "kl": 0.0172119140625, "learning_rate": 1.8491267959478377e-05, "loss": 0.0007, "num_tokens": 676729889.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2597934624903986, "frac_reward_zero_std": 1.0, "grad_norm": 1.302379850455072e-10, "kl": 0.017913818359375, "learning_rate": 1.8488119563039766e-05, "loss": 0.0007, "num_tokens": 677294513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.259964154647094, "frac_reward_zero_std": 1.0, "grad_norm": 1.126328038265e-10, "kl": 0.01763916015625, "learning_rate": 1.8484968153625118e-05, "loss": 0.0007, "num_tokens": 677860689.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26013484680378934, "frac_reward_zero_std": 1.0, "grad_norm": 1.1569991750696397e-10, "kl": 0.017852783203125, "learning_rate": 1.8481813732353064e-05, "loss": 0.0007, "num_tokens": 678422353.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26030553896048475, "frac_reward_zero_std": 1.0, "grad_norm": 1.246551289834135e-10, "kl": 0.017181396484375, "learning_rate": 1.8478656300343303e-05, "loss": 0.0007, "num_tokens": 678988449.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26047623111718016, "frac_reward_zero_std": 1.0, "grad_norm": 1.3174478133650345e-10, "kl": 0.01715087890625, "learning_rate": 1.8475495858716624e-05, "loss": 0.0007, "num_tokens": 679557025.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26064692327387556, "frac_reward_zero_std": 1.0, "grad_norm": 1.273032500220607e-10, "kl": 0.0177001953125, "learning_rate": 1.8472332408594864e-05, "loss": 0.0007, "num_tokens": 680118833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26081761543057097, "frac_reward_zero_std": 1.0, "grad_norm": 1.2620206331892874e-10, "kl": 0.017486572265625, "learning_rate": 1.846916595110093e-05, "loss": 0.0007, "num_tokens": 680682593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2609883075872664, "frac_reward_zero_std": 1.0, "grad_norm": 1.237320745336242e-10, "kl": 0.017059326171875, "learning_rate": 1.8465996487358805e-05, "loss": 0.0007, "num_tokens": 681244769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2611589997439618, "frac_reward_zero_std": 1.0, "grad_norm": 1.2494785292941175e-10, "kl": 0.01788330078125, "learning_rate": 1.846282401849353e-05, "loss": 0.0007, "num_tokens": 681808401.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2613296919006572, "frac_reward_zero_std": 1.0, "grad_norm": 1.4027332535225696e-10, "kl": 0.01763916015625, "learning_rate": 1.8459648545631218e-05, "loss": 0.0007, "num_tokens": 682382257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26150038405735254, "frac_reward_zero_std": 1.0, "grad_norm": 1.1244479916589908e-10, "kl": 0.017486572265625, "learning_rate": 1.845647006989905e-05, "loss": 0.0007, "num_tokens": 682948705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26167107621404795, "frac_reward_zero_std": 1.0, "grad_norm": 1.3073190321451938e-10, "kl": 0.017425537109375, "learning_rate": 1.8453288592425267e-05, "loss": 0.0007, "num_tokens": 683517969.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26184176837074336, "frac_reward_zero_std": 1.0, "grad_norm": 1.2104861526320946e-10, "kl": 0.017669677734375, "learning_rate": 1.845010411433918e-05, "loss": 0.0007, "num_tokens": 684080001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26201246052743876, "frac_reward_zero_std": 1.0, "grad_norm": 1.3760561970994734e-10, "kl": 0.01751708984375, "learning_rate": 1.8446916636771165e-05, "loss": 0.0007, "num_tokens": 684647505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26218315268413417, "frac_reward_zero_std": 1.0, "grad_norm": 1.2120773893085683e-10, "kl": 0.017486572265625, "learning_rate": 1.844372616085266e-05, "loss": 0.0007, "num_tokens": 685209713.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2623538448408296, "frac_reward_zero_std": 1.0, "grad_norm": 1.1309341684815115e-10, "kl": 0.01751708984375, "learning_rate": 1.844053268771617e-05, "loss": 0.0007, "num_tokens": 685771761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.262524536997525, "frac_reward_zero_std": 1.0, "grad_norm": 1.2382991598947e-10, "kl": 0.017425537109375, "learning_rate": 1.8437336218495258e-05, "loss": 0.0007, "num_tokens": 686333761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2626952291542204, "frac_reward_zero_std": 1.0, "grad_norm": 1.204950897582779e-10, "kl": 0.017364501953125, "learning_rate": 1.843413675432456e-05, "loss": 0.0007, "num_tokens": 686898625.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26286592131091574, "frac_reward_zero_std": 1.0, "grad_norm": 1.2888998769740337e-10, "kl": 0.01727294921875, "learning_rate": 1.843093429633977e-05, "loss": 0.0007, "num_tokens": 687469569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26303661346761115, "frac_reward_zero_std": 1.0, "grad_norm": 1.1474413135684558e-10, "kl": 0.017547607421875, "learning_rate": 1.842772884567764e-05, "loss": 0.0007, "num_tokens": 688038705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26320730562430655, "frac_reward_zero_std": 1.0, "grad_norm": 1.3603113652386308e-10, "kl": 0.018157958984375, "learning_rate": 1.8424520403475997e-05, "loss": 0.0007, "num_tokens": 688608737.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26337799778100196, "frac_reward_zero_std": 1.0, "grad_norm": 1.1996708828513655e-10, "kl": 0.016754150390625, "learning_rate": 1.8421308970873722e-05, "loss": 0.0007, "num_tokens": 689171153.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26354868993769737, "frac_reward_zero_std": 1.0, "grad_norm": 1.1907157849574702e-10, "kl": 0.017120361328125, "learning_rate": 1.8418094549010747e-05, "loss": 0.0007, "num_tokens": 689735745.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2637193820943928, "frac_reward_zero_std": 1.0, "grad_norm": 1.1907807012360638e-10, "kl": 0.017242431640625, "learning_rate": 1.8414877139028087e-05, "loss": 0.0007, "num_tokens": 690297761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2638900742510882, "frac_reward_zero_std": 1.0, "grad_norm": 1.199473721391901e-10, "kl": 0.016937255859375, "learning_rate": 1.8411656742067803e-05, "loss": 0.0007, "num_tokens": 690864561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2640607664077836, "frac_reward_zero_std": 1.0, "grad_norm": 1.2380662285277814e-10, "kl": 0.017059326171875, "learning_rate": 1.8408433359273018e-05, "loss": 0.0007, "num_tokens": 691432129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26423145856447894, "frac_reward_zero_std": 1.0, "grad_norm": 1.2664723780794458e-10, "kl": 0.017242431640625, "learning_rate": 1.8405206991787918e-05, "loss": 0.0007, "num_tokens": 691996721.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26440215072117434, "frac_reward_zero_std": 1.0, "grad_norm": 1.2571904399171516e-10, "kl": 0.01678466796875, "learning_rate": 1.8401977640757742e-05, "loss": 0.0007, "num_tokens": 692560625.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26457284287786975, "frac_reward_zero_std": 1.0, "grad_norm": 1.273709667171111e-10, "kl": 0.017303466796875, "learning_rate": 1.8398745307328802e-05, "loss": 0.0007, "num_tokens": 693122273.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26474353503456516, "frac_reward_zero_std": 1.0, "grad_norm": 1.2511236877951947e-10, "kl": 0.017242431640625, "learning_rate": 1.8395509992648456e-05, "loss": 0.0007, "num_tokens": 693689457.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26491422719126057, "frac_reward_zero_std": 1.0, "grad_norm": 1.357278489905392e-10, "kl": 0.017913818359375, "learning_rate": 1.839227169786512e-05, "loss": 0.0007, "num_tokens": 694251169.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26508491934795597, "frac_reward_zero_std": 1.0, "grad_norm": 1.3201964651560108e-10, "kl": 0.017059326171875, "learning_rate": 1.8389030424128277e-05, "loss": 0.0007, "num_tokens": 694814641.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2652556115046514, "frac_reward_zero_std": 1.0, "grad_norm": 1.369593046338234e-10, "kl": 0.01776123046875, "learning_rate": 1.838578617258846e-05, "loss": 0.0007, "num_tokens": 695384865.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2654263036613468, "frac_reward_zero_std": 1.0, "grad_norm": 1.22906804424508e-10, "kl": 0.017059326171875, "learning_rate": 1.838253894439726e-05, "loss": 0.0007, "num_tokens": 695947985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26559699581804214, "frac_reward_zero_std": 1.0, "grad_norm": 1.2855425219346913e-10, "kl": 0.01763916015625, "learning_rate": 1.8379288740707326e-05, "loss": 0.0007, "num_tokens": 696516977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26576768797473754, "frac_reward_zero_std": 1.0, "grad_norm": 1.2972210076257247e-10, "kl": 0.016693115234375, "learning_rate": 1.8376035562672366e-05, "loss": 0.0007, "num_tokens": 697084721.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26593838013143295, "frac_reward_zero_std": 1.0, "grad_norm": 1.1930907850219794e-10, "kl": 0.017852783203125, "learning_rate": 1.837277941144714e-05, "loss": 0.0007, "num_tokens": 697649921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26610907228812836, "frac_reward_zero_std": 1.0, "grad_norm": 1.2544781298471049e-10, "kl": 0.017303466796875, "learning_rate": 1.836952028818746e-05, "loss": 0.0007, "num_tokens": 698213985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26627976444482376, "frac_reward_zero_std": 1.0, "grad_norm": 1.242342184836839e-10, "kl": 0.017730712890625, "learning_rate": 1.8366258194050202e-05, "loss": 0.0007, "num_tokens": 698776897.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26645045660151917, "frac_reward_zero_std": 1.0, "grad_norm": 1.2891994775331104e-10, "kl": 0.017822265625, "learning_rate": 1.836299313019329e-05, "loss": 0.0007, "num_tokens": 699338161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2666211487582146, "frac_reward_zero_std": 1.0, "grad_norm": 1.3031721644490679e-10, "kl": 0.017547607421875, "learning_rate": 1.83597250977757e-05, "loss": 0.0007, "num_tokens": 699904673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26679184091491, "frac_reward_zero_std": 1.0, "grad_norm": 1.2978520634455617e-10, "kl": 0.017242431640625, "learning_rate": 1.8356454097957474e-05, "loss": 0.0007, "num_tokens": 700470673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26696253307160533, "frac_reward_zero_std": 1.0, "grad_norm": 1.311080076805809e-10, "kl": 0.017547607421875, "learning_rate": 1.8353180131899687e-05, "loss": 0.0007, "num_tokens": 701033841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26713322522830074, "frac_reward_zero_std": 1.0, "grad_norm": 1.2051331767552718e-10, "kl": 0.016815185546875, "learning_rate": 1.834990320076449e-05, "loss": 0.0007, "num_tokens": 701603905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26730391738499615, "frac_reward_zero_std": 1.0, "grad_norm": 1.168833853559325e-10, "kl": 0.017333984375, "learning_rate": 1.8346623305715073e-05, "loss": 0.0007, "num_tokens": 702162353.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26747460954169155, "frac_reward_zero_std": 1.0, "grad_norm": 1.2357794883307904e-10, "kl": 0.017669677734375, "learning_rate": 1.8343340447915675e-05, "loss": 0.0007, "num_tokens": 702730513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26764530169838696, "frac_reward_zero_std": 1.0, "grad_norm": 1.2593169819690637e-10, "kl": 0.017852783203125, "learning_rate": 1.8340054628531598e-05, "loss": 0.0007, "num_tokens": 703290945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26781599385508237, "frac_reward_zero_std": 1.0, "grad_norm": 1.2388143457504353e-10, "kl": 0.017669677734375, "learning_rate": 1.8336765848729183e-05, "loss": 0.0007, "num_tokens": 703859713.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2679866860117778, "frac_reward_zero_std": 1.0, "grad_norm": 1.185151263883216e-10, "kl": 0.0172119140625, "learning_rate": 1.8333474109675834e-05, "loss": 0.0007, "num_tokens": 704418273.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2681573781684732, "frac_reward_zero_std": 1.0, "grad_norm": 1.2326622678330032e-10, "kl": 0.01727294921875, "learning_rate": 1.8330179412539996e-05, "loss": 0.0007, "num_tokens": 704981665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26832807032516853, "frac_reward_zero_std": 1.0, "grad_norm": 1.1959469508910364e-10, "kl": 0.01739501953125, "learning_rate": 1.8326881758491174e-05, "loss": 0.0007, "num_tokens": 705545761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26849876248186394, "frac_reward_zero_std": 1.0, "grad_norm": 1.2797730339786328e-10, "kl": 0.017669677734375, "learning_rate": 1.8323581148699903e-05, "loss": 0.0007, "num_tokens": 706107761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26866945463855935, "frac_reward_zero_std": 1.0, "grad_norm": 1.395663137737582e-10, "kl": 0.01776123046875, "learning_rate": 1.8320277584337793e-05, "loss": 0.0007, "num_tokens": 706674369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26884014679525475, "frac_reward_zero_std": 1.0, "grad_norm": 1.2092598307226607e-10, "kl": 0.017608642578125, "learning_rate": 1.8316971066577482e-05, "loss": 0.0007, "num_tokens": 707236177.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26901083895195016, "frac_reward_zero_std": 1.0, "grad_norm": 1.3405449157744176e-10, "kl": 0.017425537109375, "learning_rate": 1.8313661596592675e-05, "loss": 0.0007, "num_tokens": 707804081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26918153110864557, "frac_reward_zero_std": 1.0, "grad_norm": 1.229616332495284e-10, "kl": 0.017364501953125, "learning_rate": 1.83103491755581e-05, "loss": 0.0007, "num_tokens": 708368113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.269352223265341, "frac_reward_zero_std": 1.0, "grad_norm": 1.2930620686755642e-10, "kl": 0.017303466796875, "learning_rate": 1.8307033804649553e-05, "loss": 0.0007, "num_tokens": 708934577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2695229154220364, "frac_reward_zero_std": 1.0, "grad_norm": 1.3547653999651214e-10, "kl": 0.016845703125, "learning_rate": 1.830371548504388e-05, "loss": 0.0007, "num_tokens": 709500785.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26969360757873173, "frac_reward_zero_std": 1.0, "grad_norm": 1.321454993442024e-10, "kl": 0.016998291015625, "learning_rate": 1.830039421791895e-05, "loss": 0.0007, "num_tokens": 710063361.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26986429973542714, "frac_reward_zero_std": 1.0, "grad_norm": 1.1950338655592943e-10, "kl": 0.017242431640625, "learning_rate": 1.8297070004453702e-05, "loss": 0.0007, "num_tokens": 710621777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27003499189212254, "frac_reward_zero_std": 1.0, "grad_norm": 1.1574650091830146e-10, "kl": 0.0172119140625, "learning_rate": 1.829374284582811e-05, "loss": 0.0007, "num_tokens": 711182801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27020568404881795, "frac_reward_zero_std": 1.0, "grad_norm": 1.223329637489628e-10, "kl": 0.017059326171875, "learning_rate": 1.8290412743223188e-05, "loss": 0.0007, "num_tokens": 711748929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27037637620551336, "frac_reward_zero_std": 1.0, "grad_norm": 1.3316719999761604e-10, "kl": 0.017974853515625, "learning_rate": 1.8287079697821018e-05, "loss": 0.0007, "num_tokens": 712313265.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27054706836220876, "frac_reward_zero_std": 1.0, "grad_norm": 1.2058019220593783e-10, "kl": 0.017486572265625, "learning_rate": 1.82837437108047e-05, "loss": 0.0007, "num_tokens": 712884625.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27071776051890417, "frac_reward_zero_std": 1.0, "grad_norm": 1.1842794179231114e-10, "kl": 0.017364501953125, "learning_rate": 1.8280404783358387e-05, "loss": 0.0007, "num_tokens": 713447201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2708884526755996, "frac_reward_zero_std": 1.0, "grad_norm": 1.1674195504476125e-10, "kl": 0.0167236328125, "learning_rate": 1.8277062916667283e-05, "loss": 0.0007, "num_tokens": 714020561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27105914483229493, "frac_reward_zero_std": 1.0, "grad_norm": 1.1471637509861561e-10, "kl": 0.016693115234375, "learning_rate": 1.8273718111917627e-05, "loss": 0.0007, "num_tokens": 714599361.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27122983698899034, "frac_reward_zero_std": 1.0, "grad_norm": 1.222288042464593e-10, "kl": 0.017578125, "learning_rate": 1.8270370370296707e-05, "loss": 0.0007, "num_tokens": 715160385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27140052914568574, "frac_reward_zero_std": 1.0, "grad_norm": 1.2014059857575178e-10, "kl": 0.0177001953125, "learning_rate": 1.826701969299285e-05, "loss": 0.0007, "num_tokens": 715721729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27157122130238115, "frac_reward_zero_std": 1.0, "grad_norm": 1.2424143965036056e-10, "kl": 0.01776123046875, "learning_rate": 1.8263666081195423e-05, "loss": 0.0007, "num_tokens": 716286017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27174191345907656, "frac_reward_zero_std": 1.0, "grad_norm": 1.1334449560681297e-10, "kl": 0.0164794921875, "learning_rate": 1.8260309536094837e-05, "loss": 0.0007, "num_tokens": 716852577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27191260561577196, "frac_reward_zero_std": 1.0, "grad_norm": 1.1935399812637541e-10, "kl": 0.01776123046875, "learning_rate": 1.8256950058882547e-05, "loss": 0.0007, "num_tokens": 717415729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27208329777246737, "frac_reward_zero_std": 1.0, "grad_norm": 1.459991464461214e-10, "kl": 0.017547607421875, "learning_rate": 1.8253587650751045e-05, "loss": 0.0007, "num_tokens": 717985377.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2722539899291628, "frac_reward_zero_std": 1.0, "grad_norm": 1.2645160169573631e-10, "kl": 0.0179443359375, "learning_rate": 1.8250222312893866e-05, "loss": 0.0007, "num_tokens": 718547281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2724246820858581, "frac_reward_zero_std": 1.0, "grad_norm": 1.2841845377447162e-10, "kl": 0.017181396484375, "learning_rate": 1.824685404650558e-05, "loss": 0.0007, "num_tokens": 719122529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27259537424255353, "frac_reward_zero_std": 1.0, "grad_norm": 1.2315847310617934e-10, "kl": 0.0172119140625, "learning_rate": 1.82434828527818e-05, "loss": 0.0007, "num_tokens": 719684257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27276606639924894, "frac_reward_zero_std": 1.0, "grad_norm": 1.291106446410445e-10, "kl": 0.017578125, "learning_rate": 1.8240108732919185e-05, "loss": 0.0007, "num_tokens": 720247521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27293675855594435, "frac_reward_zero_std": 1.0, "grad_norm": 1.392568012072303e-10, "kl": 0.0179443359375, "learning_rate": 1.8236731688115417e-05, "loss": 0.0007, "num_tokens": 720813489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27310745071263975, "frac_reward_zero_std": 1.0, "grad_norm": 1.2629790095771312e-10, "kl": 0.017425537109375, "learning_rate": 1.8233351719569227e-05, "loss": 0.0007, "num_tokens": 721377617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27327814286933516, "frac_reward_zero_std": 1.0, "grad_norm": 1.1963952553637476e-10, "kl": 0.017303466796875, "learning_rate": 1.8229968828480387e-05, "loss": 0.0007, "num_tokens": 721944081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27344883502603057, "frac_reward_zero_std": 1.0, "grad_norm": 1.329996691524249e-10, "kl": 0.01751708984375, "learning_rate": 1.8226583016049698e-05, "loss": 0.0007, "num_tokens": 722508849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.273619527182726, "frac_reward_zero_std": 1.0, "grad_norm": 1.2392989562832738e-10, "kl": 0.017333984375, "learning_rate": 1.8223194283479e-05, "loss": 0.0007, "num_tokens": 723073969.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2737902193394214, "frac_reward_zero_std": 1.0, "grad_norm": 1.1938662501281527e-10, "kl": 0.017120361328125, "learning_rate": 1.8219802631971167e-05, "loss": 0.0007, "num_tokens": 723639153.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27396091149611673, "frac_reward_zero_std": 1.0, "grad_norm": 1.3814714666819136e-10, "kl": 0.0172119140625, "learning_rate": 1.8216408062730124e-05, "loss": 0.0007, "num_tokens": 724201537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27413160365281214, "frac_reward_zero_std": 1.0, "grad_norm": 1.2560327338716388e-10, "kl": 0.017242431640625, "learning_rate": 1.821301057696081e-05, "loss": 0.0007, "num_tokens": 724767857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27430229580950755, "frac_reward_zero_std": 1.0, "grad_norm": 1.1961282389156796e-10, "kl": 0.016845703125, "learning_rate": 1.8209610175869214e-05, "loss": 0.0007, "num_tokens": 725330721.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27447298796620295, "frac_reward_zero_std": 1.0, "grad_norm": 1.2928458727954138e-10, "kl": 0.017242431640625, "learning_rate": 1.820620686066236e-05, "loss": 0.0007, "num_tokens": 725894145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27464368012289836, "frac_reward_zero_std": 1.0, "grad_norm": 1.19663753636338e-10, "kl": 0.016876220703125, "learning_rate": 1.8202800632548292e-05, "loss": 0.0007, "num_tokens": 726456737.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27481437227959377, "frac_reward_zero_std": 1.0, "grad_norm": 1.1907126024448213e-10, "kl": 0.017669677734375, "learning_rate": 1.8199391492736107e-05, "loss": 0.0007, "num_tokens": 727020817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2749850644362892, "frac_reward_zero_std": 1.0, "grad_norm": 1.1971098430368604e-10, "kl": 0.016693115234375, "learning_rate": 1.819597944243592e-05, "loss": 0.0007, "num_tokens": 727588033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2751557565929846, "frac_reward_zero_std": 1.0, "grad_norm": 1.2130374483903396e-10, "kl": 0.01788330078125, "learning_rate": 1.819256448285889e-05, "loss": 0.0007, "num_tokens": 728158113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27532644874967993, "frac_reward_zero_std": 1.0, "grad_norm": 1.187898303433062e-10, "kl": 0.01739501953125, "learning_rate": 1.8189146615217205e-05, "loss": 0.0007, "num_tokens": 728743089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27549714090637534, "frac_reward_zero_std": 1.0, "grad_norm": 1.272115788399772e-10, "kl": 0.016937255859375, "learning_rate": 1.8185725840724083e-05, "loss": 0.0007, "num_tokens": 729310481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27566783306307074, "frac_reward_zero_std": 1.0, "grad_norm": 1.1251257795536145e-10, "kl": 0.0167236328125, "learning_rate": 1.8182302160593773e-05, "loss": 0.0007, "num_tokens": 729873569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27583852521976615, "frac_reward_zero_std": 1.0, "grad_norm": 1.187482773931095e-10, "kl": 0.017547607421875, "learning_rate": 1.8178875576041562e-05, "loss": 0.0007, "num_tokens": 730439905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27600921737646156, "frac_reward_zero_std": 1.0, "grad_norm": 1.2187359349689534e-10, "kl": 0.017852783203125, "learning_rate": 1.8175446088283762e-05, "loss": 0.0007, "num_tokens": 731004161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27617990953315696, "frac_reward_zero_std": 1.0, "grad_norm": 1.2625186074298504e-10, "kl": 0.017547607421875, "learning_rate": 1.8172013698537714e-05, "loss": 0.0007, "num_tokens": 731579489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27635060168985237, "frac_reward_zero_std": 1.0, "grad_norm": 1.1610798788868818e-10, "kl": 0.017303466796875, "learning_rate": 1.81685784080218e-05, "loss": 0.0007, "num_tokens": 732140881.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2765212938465478, "frac_reward_zero_std": 1.0, "grad_norm": 1.443559060167871e-10, "kl": 0.017425537109375, "learning_rate": 1.8165140217955417e-05, "loss": 0.0007, "num_tokens": 732707073.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27669198600324313, "frac_reward_zero_std": 1.0, "grad_norm": 1.1171031239891794e-10, "kl": 0.01788330078125, "learning_rate": 1.8161699129559002e-05, "loss": 0.0007, "num_tokens": 733272257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27686267815993854, "frac_reward_zero_std": 1.0, "grad_norm": 1.3815382985914338e-10, "kl": 0.017791748046875, "learning_rate": 1.8158255144054018e-05, "loss": 0.0007, "num_tokens": 733838321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27703337031663394, "frac_reward_zero_std": 1.0, "grad_norm": 1.1183315521673745e-10, "kl": 0.018218994140625, "learning_rate": 1.815480826266295e-05, "loss": 0.0007, "num_tokens": 734403089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27720406247332935, "frac_reward_zero_std": 1.0, "grad_norm": 1.2902222577872998e-10, "kl": 0.0169677734375, "learning_rate": 1.8151358486609324e-05, "loss": 0.0007, "num_tokens": 734972353.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27737475463002476, "frac_reward_zero_std": 1.0, "grad_norm": 1.1685089243589866e-10, "kl": 0.017486572265625, "learning_rate": 1.814790581711768e-05, "loss": 0.0007, "num_tokens": 735534593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27754544678672016, "frac_reward_zero_std": 1.0, "grad_norm": 1.2910284189687754e-10, "kl": 0.016876220703125, "learning_rate": 1.81444502554136e-05, "loss": 0.0007, "num_tokens": 736104113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27771613894341557, "frac_reward_zero_std": 1.0, "grad_norm": 1.1925400306769917e-10, "kl": 0.016754150390625, "learning_rate": 1.814099180272367e-05, "loss": 0.0007, "num_tokens": 736667057.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.277886831100111, "frac_reward_zero_std": 1.0, "grad_norm": 1.3458441014378828e-10, "kl": 0.01708984375, "learning_rate": 1.8137530460275524e-05, "loss": 0.0007, "num_tokens": 737231905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2780575232568063, "frac_reward_zero_std": 1.0, "grad_norm": 1.231813411457413e-10, "kl": 0.01715087890625, "learning_rate": 1.8134066229297812e-05, "loss": 0.0007, "num_tokens": 737802481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27822821541350173, "frac_reward_zero_std": 1.0, "grad_norm": 1.215794532235968e-10, "kl": 0.0172119140625, "learning_rate": 1.8130599111020215e-05, "loss": 0.0007, "num_tokens": 738368385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27839890757019714, "frac_reward_zero_std": 1.0, "grad_norm": 1.2733394525138938e-10, "kl": 0.01678466796875, "learning_rate": 1.812712910667343e-05, "loss": 0.0007, "num_tokens": 738940065.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27856959972689255, "frac_reward_zero_std": 1.0, "grad_norm": 1.14139970252374e-10, "kl": 0.01690673828125, "learning_rate": 1.8123656217489184e-05, "loss": 0.0007, "num_tokens": 739506177.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27874029188358795, "frac_reward_zero_std": 1.0, "grad_norm": 1.1592883303726228e-10, "kl": 0.01715087890625, "learning_rate": 1.8120180444700227e-05, "loss": 0.0007, "num_tokens": 740070561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27891098404028336, "frac_reward_zero_std": 1.0, "grad_norm": 1.36226946747036e-10, "kl": 0.01751708984375, "learning_rate": 1.8116701789540337e-05, "loss": 0.0007, "num_tokens": 740636385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27908167619697877, "frac_reward_zero_std": 1.0, "grad_norm": 1.363202262578284e-10, "kl": 0.016632080078125, "learning_rate": 1.8113220253244304e-05, "loss": 0.0007, "num_tokens": 741203697.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2792523683536742, "frac_reward_zero_std": 1.0, "grad_norm": 1.1684002302823196e-10, "kl": 0.017913818359375, "learning_rate": 1.8109735837047956e-05, "loss": 0.0007, "num_tokens": 741763665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2794230605103695, "frac_reward_zero_std": 1.0, "grad_norm": 1.176153114501383e-10, "kl": 0.01702880859375, "learning_rate": 1.8106248542188125e-05, "loss": 0.0007, "num_tokens": 742330529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27959375266706493, "frac_reward_zero_std": 1.0, "grad_norm": 1.332207855606575e-10, "kl": 0.01715087890625, "learning_rate": 1.8102758369902684e-05, "loss": 0.0007, "num_tokens": 742898961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27976444482376034, "frac_reward_zero_std": 1.0, "grad_norm": 1.199613245129406e-10, "kl": 0.017730712890625, "learning_rate": 1.8099265321430513e-05, "loss": 0.0007, "num_tokens": 743462017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27993513698045575, "frac_reward_zero_std": 1.0, "grad_norm": 1.2964370899994281e-10, "kl": 0.017822265625, "learning_rate": 1.809576939801152e-05, "loss": 0.0007, "num_tokens": 744025665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28010582913715115, "frac_reward_zero_std": 1.0, "grad_norm": 1.2652667323011623e-10, "kl": 0.016876220703125, "learning_rate": 1.809227060088663e-05, "loss": 0.0007, "num_tokens": 744589601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28027652129384656, "frac_reward_zero_std": 1.0, "grad_norm": 1.184432523021282e-10, "kl": 0.017333984375, "learning_rate": 1.8088768931297792e-05, "loss": 0.0007, "num_tokens": 745154417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28044721345054197, "frac_reward_zero_std": 1.0, "grad_norm": 1.355242336748137e-10, "kl": 0.017578125, "learning_rate": 1.808526439048797e-05, "loss": 0.0007, "num_tokens": 745718145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2806179056072374, "frac_reward_zero_std": 1.0, "grad_norm": 1.2153842231660388e-10, "kl": 0.016815185546875, "learning_rate": 1.8081756979701156e-05, "loss": 0.0007, "num_tokens": 746290065.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2807885977639327, "frac_reward_zero_std": 1.0, "grad_norm": 1.353990088954307e-10, "kl": 0.01751708984375, "learning_rate": 1.8078246700182345e-05, "loss": 0.0007, "num_tokens": 746857617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28095928992062813, "frac_reward_zero_std": 1.0, "grad_norm": 1.2673938143406656e-10, "kl": 0.01727294921875, "learning_rate": 1.8074733553177562e-05, "loss": 0.0007, "num_tokens": 747423713.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28112998207732354, "frac_reward_zero_std": 1.0, "grad_norm": 1.1940876418483073e-10, "kl": 0.016876220703125, "learning_rate": 1.807121753993385e-05, "loss": 0.0007, "num_tokens": 747994417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28130067423401894, "frac_reward_zero_std": 1.0, "grad_norm": 1.278591240843566e-10, "kl": 0.017852783203125, "learning_rate": 1.8067698661699266e-05, "loss": 0.0007, "num_tokens": 748560593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28147136639071435, "frac_reward_zero_std": 1.0, "grad_norm": 1.221415115763194e-10, "kl": 0.017913818359375, "learning_rate": 1.8064176919722885e-05, "loss": 0.0007, "num_tokens": 749126193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28164205854740976, "frac_reward_zero_std": 1.0, "grad_norm": 1.2560077111866953e-10, "kl": 0.017120361328125, "learning_rate": 1.8060652315254797e-05, "loss": 0.0007, "num_tokens": 749691569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28181275070410516, "frac_reward_zero_std": 1.0, "grad_norm": 1.2604373107391642e-10, "kl": 0.017578125, "learning_rate": 1.8057124849546107e-05, "loss": 0.0007, "num_tokens": 750255729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28198344286080057, "frac_reward_zero_std": 1.0, "grad_norm": 1.325452028762358e-10, "kl": 0.017120361328125, "learning_rate": 1.8053594523848942e-05, "loss": 0.0007, "num_tokens": 750820705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2821541350174959, "frac_reward_zero_std": 1.0, "grad_norm": 1.2385053879413956e-10, "kl": 0.0177001953125, "learning_rate": 1.805006133941644e-05, "loss": 0.0007, "num_tokens": 751396769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28232482717419133, "frac_reward_zero_std": 1.0, "grad_norm": 1.3736119550296963e-10, "kl": 0.017364501953125, "learning_rate": 1.8046525297502754e-05, "loss": 0.0007, "num_tokens": 751969729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28249551933088674, "frac_reward_zero_std": 1.0, "grad_norm": 1.2025391683711566e-10, "kl": 0.017425537109375, "learning_rate": 1.8042986399363053e-05, "loss": 0.0007, "num_tokens": 752535617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28266621148758214, "frac_reward_zero_std": 1.0, "grad_norm": 1.2830860995508208e-10, "kl": 0.017486572265625, "learning_rate": 1.803944464625351e-05, "loss": 0.0007, "num_tokens": 753104097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28283690364427755, "frac_reward_zero_std": 1.0, "grad_norm": 1.2815090583024456e-10, "kl": 0.017608642578125, "learning_rate": 1.803590003943133e-05, "loss": 0.0007, "num_tokens": 753670049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28300759580097296, "frac_reward_zero_std": 1.0, "grad_norm": 1.2087318720522134e-10, "kl": 0.016937255859375, "learning_rate": 1.8032352580154708e-05, "loss": 0.0007, "num_tokens": 754238033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28317828795766836, "frac_reward_zero_std": 1.0, "grad_norm": 1.1823450017174224e-10, "kl": 0.017486572265625, "learning_rate": 1.8028802269682878e-05, "loss": 0.0007, "num_tokens": 754806113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28334898011436377, "frac_reward_zero_std": 1.0, "grad_norm": 1.2622863301845905e-10, "kl": 0.0172119140625, "learning_rate": 1.802524910927606e-05, "loss": 0.0007, "num_tokens": 755374001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2835196722710591, "frac_reward_zero_std": 1.0, "grad_norm": 1.3550870561948776e-10, "kl": 0.017791748046875, "learning_rate": 1.802169310019551e-05, "loss": 0.0007, "num_tokens": 755938097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2836903644277545, "frac_reward_zero_std": 1.0, "grad_norm": 1.2025734795173512e-10, "kl": 0.01708984375, "learning_rate": 1.801813424370347e-05, "loss": 0.0007, "num_tokens": 756502913.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28386105658444993, "frac_reward_zero_std": 1.0, "grad_norm": 1.241857291550288e-10, "kl": 0.01715087890625, "learning_rate": 1.8014572541063212e-05, "loss": 0.0007, "num_tokens": 757072369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28403174874114534, "frac_reward_zero_std": 1.0, "grad_norm": 1.3149897839113802e-10, "kl": 0.01739501953125, "learning_rate": 1.8011007993539015e-05, "loss": 0.0007, "num_tokens": 757639601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28420244089784075, "frac_reward_zero_std": 1.0, "grad_norm": 1.370658982401212e-10, "kl": 0.018218994140625, "learning_rate": 1.800744060239616e-05, "loss": 0.0007, "num_tokens": 758216737.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28437313305453615, "frac_reward_zero_std": 1.0, "grad_norm": 1.2552599611949838e-10, "kl": 0.017547607421875, "learning_rate": 1.800387036890094e-05, "loss": 0.0007, "num_tokens": 758786225.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28454382521123156, "frac_reward_zero_std": 1.0, "grad_norm": 1.2605817090151279e-10, "kl": 0.01751708984375, "learning_rate": 1.8000297294320662e-05, "loss": 0.0007, "num_tokens": 759351617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28471451736792697, "frac_reward_zero_std": 1.0, "grad_norm": 1.2866887815635734e-10, "kl": 0.01763916015625, "learning_rate": 1.7996721379923643e-05, "loss": 0.0007, "num_tokens": 759924017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2848852095246223, "frac_reward_zero_std": 1.0, "grad_norm": 1.2495954094698986e-10, "kl": 0.017547607421875, "learning_rate": 1.79931426269792e-05, "loss": 0.0007, "num_tokens": 760486977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2850559016813177, "frac_reward_zero_std": 1.0, "grad_norm": 1.216763262107073e-10, "kl": 0.01708984375, "learning_rate": 1.7989561036757653e-05, "loss": 0.0007, "num_tokens": 761054849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28522659383801313, "frac_reward_zero_std": 1.0, "grad_norm": 1.363796593073911e-10, "kl": 0.01739501953125, "learning_rate": 1.7985976610530352e-05, "loss": 0.0007, "num_tokens": 761623361.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28539728599470854, "frac_reward_zero_std": 1.0, "grad_norm": 1.156595872303579e-10, "kl": 0.017669677734375, "learning_rate": 1.7982389349569626e-05, "loss": 0.0007, "num_tokens": 762188001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28556797815140395, "frac_reward_zero_std": 1.0, "grad_norm": 1.4774107200400322e-10, "kl": 0.01727294921875, "learning_rate": 1.797879925514883e-05, "loss": 0.0007, "num_tokens": 762755473.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28573867030809935, "frac_reward_zero_std": 1.0, "grad_norm": 1.1677769559912537e-10, "kl": 0.017059326171875, "learning_rate": 1.7975206328542318e-05, "loss": 0.0007, "num_tokens": 763317521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28590936246479476, "frac_reward_zero_std": 1.0, "grad_norm": 1.1942140079076245e-10, "kl": 0.0174560546875, "learning_rate": 1.797161057102545e-05, "loss": 0.0007, "num_tokens": 763884737.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28608005462149017, "frac_reward_zero_std": 1.0, "grad_norm": 1.2103525254484642e-10, "kl": 0.017425537109375, "learning_rate": 1.7968011983874584e-05, "loss": 0.0007, "num_tokens": 764450305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2862507467781855, "frac_reward_zero_std": 1.0, "grad_norm": 1.109183636009441e-10, "kl": 0.01776123046875, "learning_rate": 1.7964410568367094e-05, "loss": 0.0007, "num_tokens": 765015153.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2864214389348809, "frac_reward_zero_std": 1.0, "grad_norm": 1.216090939193146e-10, "kl": 0.017669677734375, "learning_rate": 1.796080632578135e-05, "loss": 0.0007, "num_tokens": 765575601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28659213109157633, "frac_reward_zero_std": 1.0, "grad_norm": 1.3017236812639702e-10, "kl": 0.017547607421875, "learning_rate": 1.795719925739673e-05, "loss": 0.0007, "num_tokens": 766140481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28676282324827174, "frac_reward_zero_std": 1.0, "grad_norm": 1.2183162649103825e-10, "kl": 0.017059326171875, "learning_rate": 1.7953589364493612e-05, "loss": 0.0007, "num_tokens": 766703585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28693351540496714, "frac_reward_zero_std": 1.0, "grad_norm": 1.3661062417034808e-10, "kl": 0.01751708984375, "learning_rate": 1.7949976648353377e-05, "loss": 0.0007, "num_tokens": 767269009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28710420756166255, "frac_reward_zero_std": 1.0, "grad_norm": 1.2164442502135852e-10, "kl": 0.01739501953125, "learning_rate": 1.794636111025841e-05, "loss": 0.0007, "num_tokens": 767832465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28727489971835796, "frac_reward_zero_std": 1.0, "grad_norm": 1.1690618776869547e-10, "kl": 0.016845703125, "learning_rate": 1.7942742751492095e-05, "loss": 0.0007, "num_tokens": 768392353.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28744559187505336, "frac_reward_zero_std": 1.0, "grad_norm": 1.2973676425633547e-10, "kl": 0.018310546875, "learning_rate": 1.793912157333882e-05, "loss": 0.0007, "num_tokens": 768954305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2876162840317487, "frac_reward_zero_std": 1.0, "grad_norm": 1.2362714157938947e-10, "kl": 0.017303466796875, "learning_rate": 1.7935497577083974e-05, "loss": 0.0007, "num_tokens": 769520145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2877869761884441, "frac_reward_zero_std": 1.0, "grad_norm": 1.162137459528216e-10, "kl": 0.01715087890625, "learning_rate": 1.7931870764013942e-05, "loss": 0.0007, "num_tokens": 770089089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28795766834513953, "frac_reward_zero_std": 1.0, "grad_norm": 1.1121645392814983e-10, "kl": 0.016876220703125, "learning_rate": 1.7928241135416114e-05, "loss": 0.0007, "num_tokens": 770652177.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28812836050183493, "frac_reward_zero_std": 1.0, "grad_norm": 1.341549599398956e-10, "kl": 0.0172119140625, "learning_rate": 1.7924608692578874e-05, "loss": 0.0007, "num_tokens": 771220625.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28829905265853034, "frac_reward_zero_std": 1.0, "grad_norm": 1.2034911817378757e-10, "kl": 0.0172119140625, "learning_rate": 1.7920973436791607e-05, "loss": 0.0007, "num_tokens": 771785233.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28846974481522575, "frac_reward_zero_std": 1.0, "grad_norm": 1.1821627021214467e-10, "kl": 0.017608642578125, "learning_rate": 1.791733536934471e-05, "loss": 0.0007, "num_tokens": 772349969.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28864043697192115, "frac_reward_zero_std": 1.0, "grad_norm": 1.2826005000975638e-10, "kl": 0.017974853515625, "learning_rate": 1.791369449152955e-05, "loss": 0.0007, "num_tokens": 772914913.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28881112912861656, "frac_reward_zero_std": 1.0, "grad_norm": 1.2021737033990554e-10, "kl": 0.017852783203125, "learning_rate": 1.7910050804638514e-05, "loss": 0.0007, "num_tokens": 773477233.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2889818212853119, "frac_reward_zero_std": 1.0, "grad_norm": 1.2120079509600406e-10, "kl": 0.017425537109375, "learning_rate": 1.790640430996498e-05, "loss": 0.0007, "num_tokens": 774040321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2891525134420073, "frac_reward_zero_std": 1.0, "grad_norm": 1.2609242356905175e-10, "kl": 0.018341064453125, "learning_rate": 1.790275500880332e-05, "loss": 0.0007, "num_tokens": 774606801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2893232055987027, "frac_reward_zero_std": 1.0, "grad_norm": 1.3481889929041088e-10, "kl": 0.017333984375, "learning_rate": 1.7899102902448904e-05, "loss": 0.0007, "num_tokens": 775169249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28949389775539813, "frac_reward_zero_std": 1.0, "grad_norm": 1.319510739792459e-10, "kl": 0.017364501953125, "learning_rate": 1.7895447992198098e-05, "loss": 0.0007, "num_tokens": 775744929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28966458991209354, "frac_reward_zero_std": 1.0, "grad_norm": 1.2229630124070202e-10, "kl": 0.017486572265625, "learning_rate": 1.7891790279348267e-05, "loss": 0.0007, "num_tokens": 776310833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28983528206878895, "frac_reward_zero_std": 1.0, "grad_norm": 1.273626899394847e-10, "kl": 0.01739501953125, "learning_rate": 1.7888129765197762e-05, "loss": 0.0007, "num_tokens": 776875057.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29000597422548435, "frac_reward_zero_std": 1.0, "grad_norm": 1.2980106124872434e-10, "kl": 0.01715087890625, "learning_rate": 1.7884466451045937e-05, "loss": 0.0007, "num_tokens": 777440849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29017666638217976, "frac_reward_zero_std": 1.0, "grad_norm": 1.1421327468868542e-10, "kl": 0.0179443359375, "learning_rate": 1.7880800338193134e-05, "loss": 0.0007, "num_tokens": 778008529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2903473585388751, "frac_reward_zero_std": 1.0, "grad_norm": 1.1577345944012705e-10, "kl": 0.016998291015625, "learning_rate": 1.7877131427940693e-05, "loss": 0.0007, "num_tokens": 778573905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2905180506955705, "frac_reward_zero_std": 1.0, "grad_norm": 1.250533609873857e-10, "kl": 0.01739501953125, "learning_rate": 1.787345972159094e-05, "loss": 0.0007, "num_tokens": 779141809.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2906887428522659, "frac_reward_zero_std": 1.0, "grad_norm": 1.271649368615778e-10, "kl": 0.017425537109375, "learning_rate": 1.7869785220447204e-05, "loss": 0.0007, "num_tokens": 779711905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29085943500896133, "frac_reward_zero_std": 1.0, "grad_norm": 1.234309510917888e-10, "kl": 0.017669677734375, "learning_rate": 1.78661079258138e-05, "loss": 0.0007, "num_tokens": 780278065.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29103012716565674, "frac_reward_zero_std": 1.0, "grad_norm": 1.1255333831447162e-10, "kl": 0.01727294921875, "learning_rate": 1.7862427838996025e-05, "loss": 0.0007, "num_tokens": 780845153.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29120081932235214, "frac_reward_zero_std": 1.0, "grad_norm": 1.1522282347495718e-10, "kl": 0.017364501953125, "learning_rate": 1.785874496130019e-05, "loss": 0.0007, "num_tokens": 781407057.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29137151147904755, "frac_reward_zero_std": 1.0, "grad_norm": 1.230719082145615e-10, "kl": 0.017181396484375, "learning_rate": 1.7855059294033575e-05, "loss": 0.0007, "num_tokens": 781967489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29154220363574296, "frac_reward_zero_std": 1.0, "grad_norm": 1.1151363876262994e-10, "kl": 0.017181396484375, "learning_rate": 1.785137083850447e-05, "loss": 0.0007, "num_tokens": 782528241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29171289579243836, "frac_reward_zero_std": 1.0, "grad_norm": 1.2185021446318281e-10, "kl": 0.017578125, "learning_rate": 1.7847679596022124e-05, "loss": 0.0007, "num_tokens": 783090833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2918835879491337, "frac_reward_zero_std": 1.0, "grad_norm": 1.2221995075511977e-10, "kl": 0.016937255859375, "learning_rate": 1.7843985567896817e-05, "loss": 0.0007, "num_tokens": 783656945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2920542801058291, "frac_reward_zero_std": 1.0, "grad_norm": 1.1907006018449123e-10, "kl": 0.017181396484375, "learning_rate": 1.7840288755439778e-05, "loss": 0.0007, "num_tokens": 784224961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29222497226252453, "frac_reward_zero_std": 1.0, "grad_norm": 1.2063020713022658e-10, "kl": 0.017303466796875, "learning_rate": 1.783658915996325e-05, "loss": 0.0007, "num_tokens": 784789825.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29239566441921994, "frac_reward_zero_std": 1.0, "grad_norm": 1.211649034992143e-10, "kl": 0.017425537109375, "learning_rate": 1.7832886782780455e-05, "loss": 0.0007, "num_tokens": 785368305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29256635657591534, "frac_reward_zero_std": 1.0, "grad_norm": 1.4026141692616255e-10, "kl": 0.017303466796875, "learning_rate": 1.7829181625205606e-05, "loss": 0.0007, "num_tokens": 785938641.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29273704873261075, "frac_reward_zero_std": 1.0, "grad_norm": 1.1716449282009454e-10, "kl": 0.017913818359375, "learning_rate": 1.7825473688553893e-05, "loss": 0.0007, "num_tokens": 786500609.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29290774088930616, "frac_reward_zero_std": 1.0, "grad_norm": 1.2427468264493236e-10, "kl": 0.017578125, "learning_rate": 1.7821762974141505e-05, "loss": 0.0007, "num_tokens": 787065585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29307843304600156, "frac_reward_zero_std": 1.0, "grad_norm": 1.2618050000233748e-10, "kl": 0.0172119140625, "learning_rate": 1.7818049483285614e-05, "loss": 0.0007, "num_tokens": 787629553.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2932491252026969, "frac_reward_zero_std": 1.0, "grad_norm": 1.2329788047520727e-10, "kl": 0.016448974609375, "learning_rate": 1.7814333217304368e-05, "loss": 0.0007, "num_tokens": 788198481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2934198173593923, "frac_reward_zero_std": 1.0, "grad_norm": 1.1931326133559549e-10, "kl": 0.017974853515625, "learning_rate": 1.7810614177516913e-05, "loss": 0.0007, "num_tokens": 788759553.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2935905095160877, "frac_reward_zero_std": 1.0, "grad_norm": 1.3908524212192036e-10, "kl": 0.017242431640625, "learning_rate": 1.7806892365243374e-05, "loss": 0.0007, "num_tokens": 789325201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29376120167278313, "frac_reward_zero_std": 1.0, "grad_norm": 1.385400731909642e-10, "kl": 0.017425537109375, "learning_rate": 1.7803167781804857e-05, "loss": 0.0007, "num_tokens": 789889313.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29393189382947854, "frac_reward_zero_std": 1.0, "grad_norm": 1.115584122173964e-10, "kl": 0.01727294921875, "learning_rate": 1.7799440428523452e-05, "loss": 0.0007, "num_tokens": 790449793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29410258598617395, "frac_reward_zero_std": 1.0, "grad_norm": 1.1546485993869942e-10, "kl": 0.016937255859375, "learning_rate": 1.7795710306722242e-05, "loss": 0.0007, "num_tokens": 791013281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29427327814286935, "frac_reward_zero_std": 1.0, "grad_norm": 1.2185332437537715e-10, "kl": 0.01739501953125, "learning_rate": 1.7791977417725278e-05, "loss": 0.0007, "num_tokens": 791580033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29444397029956476, "frac_reward_zero_std": 1.0, "grad_norm": 1.273617893829916e-10, "kl": 0.01788330078125, "learning_rate": 1.778824176285761e-05, "loss": 0.0007, "num_tokens": 792142145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2946146624562601, "frac_reward_zero_std": 1.0, "grad_norm": 1.2080146399513158e-10, "kl": 0.01788330078125, "learning_rate": 1.7784503343445254e-05, "loss": 0.0007, "num_tokens": 792706705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2947853546129555, "frac_reward_zero_std": 1.0, "grad_norm": 1.2041367092819447e-10, "kl": 0.01708984375, "learning_rate": 1.7780762160815212e-05, "loss": 0.0007, "num_tokens": 793271297.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2949560467696509, "frac_reward_zero_std": 1.0, "grad_norm": 1.2752913415952719e-10, "kl": 0.01751708984375, "learning_rate": 1.7777018216295476e-05, "loss": 0.0007, "num_tokens": 793840801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29512673892634633, "frac_reward_zero_std": 1.0, "grad_norm": 1.2377020808775907e-10, "kl": 0.017120361328125, "learning_rate": 1.7773271511215008e-05, "loss": 0.0007, "num_tokens": 794406417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29529743108304174, "frac_reward_zero_std": 1.0, "grad_norm": 1.2092524338891165e-10, "kl": 0.01708984375, "learning_rate": 1.776952204690375e-05, "loss": 0.0007, "num_tokens": 794972865.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29546812323973715, "frac_reward_zero_std": 1.0, "grad_norm": 1.1805067094942078e-10, "kl": 0.0177001953125, "learning_rate": 1.776576982469263e-05, "loss": 0.0007, "num_tokens": 795536417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29563881539643255, "frac_reward_zero_std": 1.0, "grad_norm": 1.209268712723528e-10, "kl": 0.0174560546875, "learning_rate": 1.776201484591355e-05, "loss": 0.0007, "num_tokens": 796102161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29580950755312796, "frac_reward_zero_std": 1.0, "grad_norm": 1.5331724209694212e-10, "kl": 0.01751708984375, "learning_rate": 1.7758257111899393e-05, "loss": 0.0007, "num_tokens": 796670993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2959801997098233, "frac_reward_zero_std": 1.0, "grad_norm": 1.2990030397529204e-10, "kl": 0.017242431640625, "learning_rate": 1.7754496623984015e-05, "loss": 0.0007, "num_tokens": 797241297.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2961508918665187, "frac_reward_zero_std": 1.0, "grad_norm": 1.1801108266412546e-10, "kl": 0.01727294921875, "learning_rate": 1.7750733383502264e-05, "loss": 0.0007, "num_tokens": 797809121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2963215840232141, "frac_reward_zero_std": 1.0, "grad_norm": 1.2807000054148952e-10, "kl": 0.01763916015625, "learning_rate": 1.774696739178994e-05, "loss": 0.0007, "num_tokens": 798380657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29649227617990953, "frac_reward_zero_std": 1.0, "grad_norm": 1.279736426452385e-10, "kl": 0.017730712890625, "learning_rate": 1.7743198650183847e-05, "loss": 0.0007, "num_tokens": 798945169.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29666296833660494, "frac_reward_zero_std": 1.0, "grad_norm": 1.2692637484650154e-10, "kl": 0.017608642578125, "learning_rate": 1.7739427160021744e-05, "loss": 0.0007, "num_tokens": 799511729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29683366049330034, "frac_reward_zero_std": 1.0, "grad_norm": 1.1439534889083138e-10, "kl": 0.0174560546875, "learning_rate": 1.7735652922642377e-05, "loss": 0.0007, "num_tokens": 800071617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29700435264999575, "frac_reward_zero_std": 1.0, "grad_norm": 1.3018626441849816e-10, "kl": 0.0184326171875, "learning_rate": 1.773187593938546e-05, "loss": 0.0007, "num_tokens": 800635377.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29717504480669116, "frac_reward_zero_std": 1.0, "grad_norm": 1.2633108966045814e-10, "kl": 0.01739501953125, "learning_rate": 1.7728096211591696e-05, "loss": 0.0007, "num_tokens": 801202657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2973457369633865, "frac_reward_zero_std": 1.0, "grad_norm": 1.3242161383556726e-10, "kl": 0.017669677734375, "learning_rate": 1.772431374060274e-05, "loss": 0.0007, "num_tokens": 801767921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2975164291200819, "frac_reward_zero_std": 1.0, "grad_norm": 1.2244500789537842e-10, "kl": 0.0177001953125, "learning_rate": 1.772052852776124e-05, "loss": 0.0007, "num_tokens": 802333169.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2976871212767773, "frac_reward_zero_std": 1.0, "grad_norm": 1.3193015970976816e-10, "kl": 0.017669677734375, "learning_rate": 1.7716740574410804e-05, "loss": 0.0007, "num_tokens": 802899441.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29785781343347273, "frac_reward_zero_std": 1.0, "grad_norm": 1.256975472567891e-10, "kl": 0.0185546875, "learning_rate": 1.771294988189603e-05, "loss": 0.0007, "num_tokens": 803465537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29802850559016814, "frac_reward_zero_std": 1.0, "grad_norm": 1.1187945464033342e-10, "kl": 0.017120361328125, "learning_rate": 1.7709156451562462e-05, "loss": 0.0007, "num_tokens": 804030465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29819919774686354, "frac_reward_zero_std": 1.0, "grad_norm": 1.2237888108919754e-10, "kl": 0.017669677734375, "learning_rate": 1.770536028475664e-05, "loss": 0.0007, "num_tokens": 804594849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29836988990355895, "frac_reward_zero_std": 1.0, "grad_norm": 1.2745693917576457e-10, "kl": 0.01788330078125, "learning_rate": 1.7701561382826068e-05, "loss": 0.0007, "num_tokens": 805157217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29854058206025436, "frac_reward_zero_std": 1.0, "grad_norm": 1.1037906226816499e-10, "kl": 0.01708984375, "learning_rate": 1.769775974711921e-05, "loss": 0.0007, "num_tokens": 805728449.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2987112742169497, "frac_reward_zero_std": 1.0, "grad_norm": 1.1539863892681043e-10, "kl": 0.017608642578125, "learning_rate": 1.769395537898552e-05, "loss": 0.0007, "num_tokens": 806303649.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2988819663736451, "frac_reward_zero_std": 1.0, "grad_norm": 1.2629867412390428e-10, "kl": 0.017730712890625, "learning_rate": 1.7690148279775402e-05, "loss": 0.0007, "num_tokens": 806871025.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2990526585303405, "frac_reward_zero_std": 1.0, "grad_norm": 1.1328235796325998e-10, "kl": 0.01776123046875, "learning_rate": 1.7686338450840246e-05, "loss": 0.0007, "num_tokens": 807430929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2992233506870359, "frac_reward_zero_std": 1.0, "grad_norm": 1.3155911217275014e-10, "kl": 0.0184326171875, "learning_rate": 1.76825258935324e-05, "loss": 0.0007, "num_tokens": 808001057.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29939404284373133, "frac_reward_zero_std": 1.0, "grad_norm": 1.131025576415212e-10, "kl": 0.0172119140625, "learning_rate": 1.7678710609205186e-05, "loss": 0.0007, "num_tokens": 808565393.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29956473500042674, "frac_reward_zero_std": 1.0, "grad_norm": 1.3113994218922876e-10, "kl": 0.017669677734375, "learning_rate": 1.7674892599212893e-05, "loss": 0.0007, "num_tokens": 809128097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29973542715712215, "frac_reward_zero_std": 1.0, "grad_norm": 1.3715473473503253e-10, "kl": 0.01678466796875, "learning_rate": 1.7671071864910772e-05, "loss": 0.0007, "num_tokens": 809694513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29990611931381755, "frac_reward_zero_std": 1.0, "grad_norm": 1.203348265288193e-10, "kl": 0.0174560546875, "learning_rate": 1.766724840765505e-05, "loss": 0.0007, "num_tokens": 810271745.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3000768114705129, "frac_reward_zero_std": 1.0, "grad_norm": 1.2056464519190436e-10, "kl": 0.01788330078125, "learning_rate": 1.7663422228802913e-05, "loss": 0.0007, "num_tokens": 810844145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3002475036272083, "frac_reward_zero_std": 1.0, "grad_norm": 1.5555091264269313e-10, "kl": 0.0172119140625, "learning_rate": 1.7659593329712522e-05, "loss": 0.0007, "num_tokens": 811418049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3004181957839037, "frac_reward_zero_std": 1.0, "grad_norm": 1.2858167784243224e-10, "kl": 0.016876220703125, "learning_rate": 1.765576171174299e-05, "loss": 0.0007, "num_tokens": 811995505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3005888879405991, "frac_reward_zero_std": 1.0, "grad_norm": 1.182906431982792e-10, "kl": 0.01715087890625, "learning_rate": 1.7651927376254407e-05, "loss": 0.0007, "num_tokens": 812560145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30075958009729453, "frac_reward_zero_std": 1.0, "grad_norm": 1.1863635585820907e-10, "kl": 0.017242431640625, "learning_rate": 1.7648090324607827e-05, "loss": 0.0007, "num_tokens": 813121489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30093027225398994, "frac_reward_zero_std": 1.0, "grad_norm": 1.315061308496735e-10, "kl": 0.016571044921875, "learning_rate": 1.7644250558165262e-05, "loss": 0.0007, "num_tokens": 813690417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30110096441068535, "frac_reward_zero_std": 1.0, "grad_norm": 1.252704559135091e-10, "kl": 0.017333984375, "learning_rate": 1.7640408078289693e-05, "loss": 0.0007, "num_tokens": 814255393.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30127165656738075, "frac_reward_zero_std": 1.0, "grad_norm": 1.328800866264114e-10, "kl": 0.017181396484375, "learning_rate": 1.7636562886345053e-05, "loss": 0.0007, "num_tokens": 814822433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3014423487240761, "frac_reward_zero_std": 1.0, "grad_norm": 1.2916501404028962e-10, "kl": 0.01715087890625, "learning_rate": 1.7632714983696258e-05, "loss": 0.0007, "num_tokens": 815388417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3016130408807715, "frac_reward_zero_std": 1.0, "grad_norm": 1.277135707619953e-10, "kl": 0.01715087890625, "learning_rate": 1.7628864371709165e-05, "loss": 0.0007, "num_tokens": 815954673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3017837330374669, "frac_reward_zero_std": 1.0, "grad_norm": 1.2411728677168292e-10, "kl": 0.017059326171875, "learning_rate": 1.7625011051750608e-05, "loss": 0.0007, "num_tokens": 816520977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3019544251941623, "frac_reward_zero_std": 1.0, "grad_norm": 1.344142041423026e-10, "kl": 0.018035888671875, "learning_rate": 1.7621155025188375e-05, "loss": 0.0007, "num_tokens": 817086353.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30212511735085773, "frac_reward_zero_std": 1.0, "grad_norm": 1.3672942729268777e-10, "kl": 0.0174560546875, "learning_rate": 1.7617296293391217e-05, "loss": 0.0007, "num_tokens": 817649409.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30229580950755314, "frac_reward_zero_std": 1.0, "grad_norm": 1.316174884809021e-10, "kl": 0.017303466796875, "learning_rate": 1.761343485772884e-05, "loss": 0.0007, "num_tokens": 818215697.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30246650166424854, "frac_reward_zero_std": 1.0, "grad_norm": 1.234226281978902e-10, "kl": 0.01739501953125, "learning_rate": 1.7609570719571924e-05, "loss": 0.0007, "num_tokens": 818779041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30263719382094395, "frac_reward_zero_std": 1.0, "grad_norm": 4234447365895.8633, "kl": 290447163392.0, "learning_rate": 1.7605703880292084e-05, "loss": 11613517824.0, "num_tokens": 819379393.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3028078859776393, "frac_reward_zero_std": 1.0, "grad_norm": 1.5939248794351658e-05, "kl": 0.016632080078125, "learning_rate": 1.7601834341261922e-05, "loss": 0.0007, "num_tokens": 819944545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3029785781343347, "frac_reward_zero_std": 1.0, "grad_norm": 0.005974490051614708, "kl": 0.024505615234375, "learning_rate": 1.7597962103854977e-05, "loss": 0.001, "num_tokens": 820514289.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1101.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 642.0, "completions/min_terminated_length": 0.0, "epoch": 0.3031492702910301, "frac_reward_zero_std": 1.0, "grad_norm": 1.807978536632139, "kl": 2.4453125, "learning_rate": 1.7594087169445756e-05, "loss": 0.098, "num_tokens": 820842065.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 18.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 6.0, "completions/min_terminated_length": 0.0, "epoch": 0.3033199624477255, "frac_reward_zero_std": 1.0, "grad_norm": 12.721838407276225, "kl": 17.375, "learning_rate": 1.759020953940972e-05, "loss": 0.6967, "num_tokens": 820884065.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 4.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 2.0, "completions/min_terminated_length": 0.0, "epoch": 0.30349065460442093, "frac_reward_zero_std": 1.0, "grad_norm": 11.865916649878997, "kl": 20.15625, "learning_rate": 1.7586329215123293e-05, "loss": 0.8066, "num_tokens": 820925073.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 10.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 5.0, "completions/min_terminated_length": 0.0, "epoch": 0.30366134676111634, "frac_reward_zero_std": 1.0, "grad_norm": 4.093407374037131, "kl": 14.21875, "learning_rate": 1.758244619796384e-05, "loss": 0.5688, "num_tokens": 820966929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 148.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 106.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 71.0, "completions/min_terminated_length": 0.0, "epoch": 0.30383203891781174, "frac_reward_zero_std": 1.0, "grad_norm": 2.2854482075341944, "kl": 6.921875, "learning_rate": 1.75785604893097e-05, "loss": 0.2764, "num_tokens": 821031073.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1267.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 937.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 740.0, "completions/min_terminated_length": 0.0, "epoch": 0.30400273107450715, "frac_reward_zero_std": 1.0, "grad_norm": 165.75904041751735, "kl": 43.25, "learning_rate": 1.7574672090540158e-05, "loss": 1.7286, "num_tokens": 821311409.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 814.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 611.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 399.0, "completions/min_terminated_length": 0.0, "epoch": 0.3041734232312025, "frac_reward_zero_std": 1.0, "grad_norm": 119.96088151947762, "kl": 28.09375, "learning_rate": 1.757078100303545e-05, "loss": 1.1246, "num_tokens": 821510417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 20.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 14.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 5.0, "completions/min_terminated_length": 0.0, "epoch": 0.3043441153878979, "frac_reward_zero_std": 1.0, "grad_norm": 4.202513340809855, "kl": 11.265625, "learning_rate": 1.756688722817678e-05, "loss": 0.4501, "num_tokens": 821547857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 150.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 114.0, "completions/min_terminated_length": 0.0, "epoch": 0.3045148075445933, "frac_reward_zero_std": 1.0, "grad_norm": 48.72712341431069, "kl": 16.265625, "learning_rate": 1.7562990767346287e-05, "loss": 0.6513, "num_tokens": 821625265.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 72.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 27.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 13.0, "completions/min_terminated_length": 0.0, "epoch": 0.3046854997012887, "frac_reward_zero_std": 1.0, "grad_norm": 2.8387040227318896, "kl": 9.703125, "learning_rate": 1.7559091621927085e-05, "loss": 0.3888, "num_tokens": 821676545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 21.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 11.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 5.0, "completions/min_terminated_length": 0.0, "epoch": 0.3048561918579841, "frac_reward_zero_std": 1.0, "grad_norm": 5.458039608348685, "kl": 11.140625, "learning_rate": 1.7555189793303222e-05, "loss": 0.445, "num_tokens": 821718209.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1938.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 938.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 495.0, "completions/min_terminated_length": 0.0, "epoch": 0.30502688401467953, "frac_reward_zero_std": 1.0, "grad_norm": 370.9921662132998, "kl": 58.25, "learning_rate": 1.7551285282859712e-05, "loss": 2.332, "num_tokens": 822001585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 1793.76171875, "completions/mean_terminated_length": 195.0, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.30519757617137494, "frac_reward_zero_std": 1.0, "grad_norm": 49.21737077867638, "kl": 13.796875, "learning_rate": 1.7547378091982507e-05, "loss": 0.552, "num_tokens": 822505124.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30536826832807035, "frac_reward_zero_std": 1.0, "grad_norm": 7.842588141227632, "kl": 3.21484375, "learning_rate": 1.7543468222058528e-05, "loss": 0.1287, "num_tokens": 823065300.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2029.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 1899.0, "completions/min_terminated_length": 0.0, "epoch": 0.3055389604847657, "frac_reward_zero_std": 1.0, "grad_norm": 0.9302472797930156, "kl": 1.740234375, "learning_rate": 1.7539555674475633e-05, "loss": 0.0696, "num_tokens": 823623220.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1931.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1448.0, "completions/min_terminated_length": 0.0, "epoch": 0.3057096526414611, "frac_reward_zero_std": 1.0, "grad_norm": 0.9878801218932076, "kl": 1.748046875, "learning_rate": 1.7535640450622637e-05, "loss": 0.0699, "num_tokens": 824157172.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1877.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1560.0, "completions/min_terminated_length": 0.0, "epoch": 0.3058803447981565, "frac_reward_zero_std": 1.0, "grad_norm": 1.8973656508072898, "kl": 2.21484375, "learning_rate": 1.75317225518893e-05, "loss": 0.0885, "num_tokens": 824676596.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1771.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 1490.0, "completions/min_terminated_length": 0.0, "epoch": 0.3060510369548519, "frac_reward_zero_std": 1.0, "grad_norm": 1.3123753452120712, "kl": 1.79296875, "learning_rate": 1.7527801979666338e-05, "loss": 0.0718, "num_tokens": 825179188.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3062217291115473, "frac_reward_zero_std": 1.0, "grad_norm": 1.4669558849346838, "kl": 1.943359375, "learning_rate": 1.752387873534541e-05, "loss": 0.0778, "num_tokens": 825749476.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30639242126824273, "frac_reward_zero_std": 1.0, "grad_norm": 0.19252903930734816, "kl": 0.373046875, "learning_rate": 1.751995282031913e-05, "loss": 0.0149, "num_tokens": 826314804.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30656311342493814, "frac_reward_zero_std": 1.0, "grad_norm": 0.027033301757032974, "kl": 0.09515380859375, "learning_rate": 1.7516024235981053e-05, "loss": 0.0038, "num_tokens": 826875956.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30673380558163355, "frac_reward_zero_std": 1.0, "grad_norm": 0.004342634829590754, "kl": 0.0330810546875, "learning_rate": 1.7512092983725684e-05, "loss": 0.0013, "num_tokens": 827439380.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3069044977383289, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010973138627509023, "kl": 0.028167724609375, "learning_rate": 1.7508159064948475e-05, "loss": 0.0011, "num_tokens": 828005972.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3070751898950243, "frac_reward_zero_std": 1.0, "grad_norm": 0.006075459457386537, "kl": 0.028900146484375, "learning_rate": 1.7504222481045828e-05, "loss": 0.0012, "num_tokens": 828568676.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3072458820517197, "frac_reward_zero_std": 1.0, "grad_norm": 0.002467741929821629, "kl": 0.0283203125, "learning_rate": 1.7500283233415086e-05, "loss": 0.0011, "num_tokens": 829130740.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3074165742084151, "frac_reward_zero_std": 1.0, "grad_norm": 0.009251678494138096, "kl": 0.0301513671875, "learning_rate": 1.7496341323454543e-05, "loss": 0.0012, "num_tokens": 829705540.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3075872663651105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026838880337698746, "kl": 0.027862548828125, "learning_rate": 1.7492396752563435e-05, "loss": 0.0011, "num_tokens": 830270964.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30775795852180593, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007410883266052873, "kl": 0.02764892578125, "learning_rate": 1.7488449522141935e-05, "loss": 0.0011, "num_tokens": 830836708.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30792865067850134, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006347900781251771, "kl": 0.02752685546875, "learning_rate": 1.7484499633591174e-05, "loss": 0.0011, "num_tokens": 831408564.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30809934283519674, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029112459420931472, "kl": 0.0283203125, "learning_rate": 1.748054708831322e-05, "loss": 0.0011, "num_tokens": 831975044.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30827003499189215, "frac_reward_zero_std": 1.0, "grad_norm": 0.00868852849199994, "kl": 0.0286865234375, "learning_rate": 1.7476591887711078e-05, "loss": 0.0011, "num_tokens": 832542660.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3084407271485875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014397832884623031, "kl": 0.029144287109375, "learning_rate": 1.7472634033188708e-05, "loss": 0.0012, "num_tokens": 833110948.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3086114193052829, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007982735044084984, "kl": 0.03192138671875, "learning_rate": 1.7468673526151002e-05, "loss": 0.0013, "num_tokens": 833678820.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3087821114619783, "frac_reward_zero_std": 1.0, "grad_norm": 0.002610591800078079, "kl": 0.030975341796875, "learning_rate": 1.7464710368003797e-05, "loss": 0.0012, "num_tokens": 834244468.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3089528036186737, "frac_reward_zero_std": 1.0, "grad_norm": 0.009387290067923652, "kl": 0.03076171875, "learning_rate": 1.7460744560153873e-05, "loss": 0.0012, "num_tokens": 834807924.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30912349577536913, "frac_reward_zero_std": 1.0, "grad_norm": 0.007020744627474684, "kl": 0.03265380859375, "learning_rate": 1.7456776104008947e-05, "loss": 0.0013, "num_tokens": 835376148.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30929418793206453, "frac_reward_zero_std": 1.0, "grad_norm": 0.006834727524387076, "kl": 0.03240966796875, "learning_rate": 1.745280500097768e-05, "loss": 0.0013, "num_tokens": 835942836.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30946488008875994, "frac_reward_zero_std": 1.0, "grad_norm": 0.009552877548341575, "kl": 0.030792236328125, "learning_rate": 1.7448831252469665e-05, "loss": 0.0012, "num_tokens": 836505860.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30963557224545535, "frac_reward_zero_std": 1.0, "grad_norm": 0.006340691684507787, "kl": 0.03057861328125, "learning_rate": 1.7444854859895445e-05, "loss": 0.0012, "num_tokens": 837074836.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3098062644021507, "frac_reward_zero_std": 1.0, "grad_norm": 0.0044128452423180045, "kl": 0.0291748046875, "learning_rate": 1.7440875824666496e-05, "loss": 0.0012, "num_tokens": 837637764.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3099769565588461, "frac_reward_zero_std": 1.0, "grad_norm": 0.001983254840103407, "kl": 0.028350830078125, "learning_rate": 1.7436894148195227e-05, "loss": 0.0011, "num_tokens": 838207428.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3101476487155415, "frac_reward_zero_std": 1.0, "grad_norm": 0.00206783164425747, "kl": 0.027435302734375, "learning_rate": 1.743290983189499e-05, "loss": 0.0011, "num_tokens": 838768884.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3103183408722369, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007840703339567857, "kl": 0.0272216796875, "learning_rate": 1.742892287718008e-05, "loss": 0.0011, "num_tokens": 839335220.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3104890330289323, "frac_reward_zero_std": 1.0, "grad_norm": 0.002516096598901855, "kl": 0.0267333984375, "learning_rate": 1.742493328546571e-05, "loss": 0.0011, "num_tokens": 839900692.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31065972518562773, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005015570528204482, "kl": 0.027008056640625, "learning_rate": 1.7420941058168056e-05, "loss": 0.0011, "num_tokens": 840463124.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31083041734232314, "frac_reward_zero_std": 1.0, "grad_norm": 0.006100590378901852, "kl": 0.025726318359375, "learning_rate": 1.7416946196704203e-05, "loss": 0.001, "num_tokens": 841028900.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31100110949901855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017480845058510058, "kl": 0.0267333984375, "learning_rate": 1.7412948702492184e-05, "loss": 0.0011, "num_tokens": 841598996.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3111718016557139, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023122686177618717, "kl": 0.026702880859375, "learning_rate": 1.7408948576950973e-05, "loss": 0.0011, "num_tokens": 842164356.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3113424938124093, "frac_reward_zero_std": 1.0, "grad_norm": 0.009583050106318717, "kl": 0.02630615234375, "learning_rate": 1.7404945821500464e-05, "loss": 0.0011, "num_tokens": 842728676.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3115131859691047, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014648290097667663, "kl": 0.026885986328125, "learning_rate": 1.740094043756149e-05, "loss": 0.0011, "num_tokens": 843295332.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3116838781258001, "frac_reward_zero_std": 1.0, "grad_norm": 0.003931887576614827, "kl": 0.026580810546875, "learning_rate": 1.7396932426555818e-05, "loss": 0.0011, "num_tokens": 843855780.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3118545702824955, "frac_reward_zero_std": 1.0, "grad_norm": 0.008233739003166578, "kl": 0.027008056640625, "learning_rate": 1.7392921789906153e-05, "loss": 0.0011, "num_tokens": 844421076.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31202526243919093, "frac_reward_zero_std": 1.0, "grad_norm": 0.000505999763577987, "kl": 0.0264892578125, "learning_rate": 1.7388908529036115e-05, "loss": 0.0011, "num_tokens": 844992740.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31219595459588634, "frac_reward_zero_std": 1.0, "grad_norm": 0.002256738010380225, "kl": 0.02911376953125, "learning_rate": 1.738489264537028e-05, "loss": 0.0012, "num_tokens": 845560276.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31236664675258174, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009450101400980432, "kl": 0.028594970703125, "learning_rate": 1.7380874140334132e-05, "loss": 0.0011, "num_tokens": 846128516.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3125373389092771, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007511112643877991, "kl": 0.029632568359375, "learning_rate": 1.73768530153541e-05, "loss": 0.0012, "num_tokens": 846705156.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3127080310659725, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013790532117813313, "kl": 0.030120849609375, "learning_rate": 1.7372829271857532e-05, "loss": 0.0012, "num_tokens": 847280372.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3128787232226679, "frac_reward_zero_std": 1.0, "grad_norm": 0.005849579347932245, "kl": 0.029144287109375, "learning_rate": 1.7368802911272723e-05, "loss": 0.0012, "num_tokens": 847846420.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3130494153793633, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035968144438582035, "kl": 0.02972412109375, "learning_rate": 1.7364773935028876e-05, "loss": 0.0012, "num_tokens": 848411764.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3132201075360587, "frac_reward_zero_std": 1.0, "grad_norm": 0.004297050672201357, "kl": 0.031463623046875, "learning_rate": 1.7360742344556133e-05, "loss": 0.0013, "num_tokens": 848975796.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31339079969275413, "frac_reward_zero_std": 1.0, "grad_norm": 0.005812503022884212, "kl": 0.0299072265625, "learning_rate": 1.7356708141285568e-05, "loss": 0.0012, "num_tokens": 849539764.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31356149184944954, "frac_reward_zero_std": 1.0, "grad_norm": 0.006703722850299826, "kl": 0.030487060546875, "learning_rate": 1.7352671326649174e-05, "loss": 0.0012, "num_tokens": 850106180.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31373218400614494, "frac_reward_zero_std": 1.0, "grad_norm": 0.06716689492437339, "kl": 0.036102294921875, "learning_rate": 1.7348631902079877e-05, "loss": 0.0014, "num_tokens": 850670948.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3139028761628403, "frac_reward_zero_std": 1.0, "grad_norm": 0.006686118898657083, "kl": 0.0306396484375, "learning_rate": 1.734458986901152e-05, "loss": 0.0012, "num_tokens": 851235572.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3140735683195357, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034262351247677843, "kl": 0.030517578125, "learning_rate": 1.7340545228878888e-05, "loss": 0.0012, "num_tokens": 851800564.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3142442604762311, "frac_reward_zero_std": 1.0, "grad_norm": 0.007828192229487193, "kl": 0.030303955078125, "learning_rate": 1.7336497983117678e-05, "loss": 0.0012, "num_tokens": 852370948.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3144149526329265, "frac_reward_zero_std": 1.0, "grad_norm": 0.007706220549094469, "kl": 0.0318603515625, "learning_rate": 1.7332448133164518e-05, "loss": 0.0013, "num_tokens": 852936212.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3145856447896219, "frac_reward_zero_std": 1.0, "grad_norm": 0.002261446891713485, "kl": 0.031005859375, "learning_rate": 1.7328395680456953e-05, "loss": 0.0012, "num_tokens": 853498692.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3147563369463173, "frac_reward_zero_std": 1.0, "grad_norm": 0.009179439549174, "kl": 0.0338134765625, "learning_rate": 1.732434062643346e-05, "loss": 0.0014, "num_tokens": 854075108.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31492702910301273, "frac_reward_zero_std": 1.0, "grad_norm": 0.0059830732189496355, "kl": 0.030029296875, "learning_rate": 1.7320282972533443e-05, "loss": 0.0012, "num_tokens": 854639044.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31509772125970814, "frac_reward_zero_std": 1.0, "grad_norm": 0.003991623694890075, "kl": 0.030853271484375, "learning_rate": 1.7316222720197216e-05, "loss": 0.0012, "num_tokens": 855207540.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3152684134164035, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013696650469802037, "kl": 0.030120849609375, "learning_rate": 1.7312159870866026e-05, "loss": 0.0012, "num_tokens": 855768708.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3154391055730989, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016055714082945617, "kl": 0.03155517578125, "learning_rate": 1.7308094425982032e-05, "loss": 0.0013, "num_tokens": 856334148.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3156097977297943, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025530434631186667, "kl": 0.030731201171875, "learning_rate": 1.7304026386988327e-05, "loss": 0.0012, "num_tokens": 856900468.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3157804898864897, "frac_reward_zero_std": 1.0, "grad_norm": 0.004437421797739851, "kl": 0.03076171875, "learning_rate": 1.7299955755328912e-05, "loss": 0.0012, "num_tokens": 857461284.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3159511820431851, "frac_reward_zero_std": 1.0, "grad_norm": 56.27647968194004, "kl": 4.671875, "learning_rate": 1.729588253244872e-05, "loss": 0.1866, "num_tokens": 858055236.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3161218741998805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0044992448700367045, "kl": 0.03314208984375, "learning_rate": 1.7291806719793595e-05, "loss": 0.0013, "num_tokens": 858621396.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31629256635657593, "frac_reward_zero_std": 1.0, "grad_norm": 0.00881901127321999, "kl": 0.03857421875, "learning_rate": 1.72877283188103e-05, "loss": 0.0015, "num_tokens": 859189220.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31646325851327134, "frac_reward_zero_std": 1.0, "grad_norm": 0.01710471664069451, "kl": 0.043701171875, "learning_rate": 1.7283647330946527e-05, "loss": 0.0017, "num_tokens": 859767380.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3166339506699667, "frac_reward_zero_std": 1.0, "grad_norm": 0.09015780797528487, "kl": 0.17529296875, "learning_rate": 1.7279563757650875e-05, "loss": 0.007, "num_tokens": 860334820.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3168046428266621, "frac_reward_zero_std": 1.0, "grad_norm": 1.4481956582469289, "kl": 1.490234375, "learning_rate": 1.7275477600372865e-05, "loss": 0.0596, "num_tokens": 860903684.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 901.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 559.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 189.0, "completions/min_terminated_length": 0.0, "epoch": 0.3169753349833575, "frac_reward_zero_std": 1.0, "grad_norm": 4.214248792672558, "kl": 3.01953125, "learning_rate": 1.727138886056294e-05, "loss": 0.1207, "num_tokens": 861085364.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 258.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 106.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 59.0, "completions/min_terminated_length": 0.0, "epoch": 0.3171460271400529, "frac_reward_zero_std": 1.0, "grad_norm": 3.349001277653074, "kl": 8.1640625, "learning_rate": 1.7267297539672445e-05, "loss": 0.3267, "num_tokens": 861152660.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 577.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 317.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 57.0, "completions/min_terminated_length": 0.0, "epoch": 0.3173167192967483, "frac_reward_zero_std": 1.0, "grad_norm": 2.516701241654389, "kl": 4.4140625, "learning_rate": 1.7263203639153663e-05, "loss": 0.1766, "num_tokens": 861279588.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 101.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 12.0, "completions/min_terminated_length": 0.0, "epoch": 0.3174874114534437, "frac_reward_zero_std": 1.0, "grad_norm": 12.723119637698524, "kl": 5.0625, "learning_rate": 1.7259107160459775e-05, "loss": 0.2026, "num_tokens": 861328372.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 19.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 14.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 8.0, "completions/min_terminated_length": 0.0, "epoch": 0.31765810361013913, "frac_reward_zero_std": 1.0, "grad_norm": 5.50446858715883, "kl": 5.890625, "learning_rate": 1.7255008105044884e-05, "loss": 0.2355, "num_tokens": 861379300.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 56.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 11.0, "completions/min_terminated_length": 0.0, "epoch": 0.31782879576683454, "frac_reward_zero_std": 1.0, "grad_norm": 10935.833978301198, "kl": 750.0, "learning_rate": 1.7250906474364008e-05, "loss": 29.9712, "num_tokens": 861427428.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 40.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 27.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 18.0, "completions/min_terminated_length": 0.0, "epoch": 0.3179994879235299, "frac_reward_zero_std": 1.0, "grad_norm": 7.678753318838321, "kl": 9.65625, "learning_rate": 1.724680226987307e-05, "loss": 0.3866, "num_tokens": 861471988.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 41.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 29.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 20.0, "completions/min_terminated_length": 0.0, "epoch": 0.3181701800802253, "frac_reward_zero_std": 1.0, "grad_norm": 4.909832111204574, "kl": 6.7890625, "learning_rate": 1.7242695493028927e-05, "loss": 0.2715, "num_tokens": 861522612.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 42.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 25.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 16.0, "completions/min_terminated_length": 0.0, "epoch": 0.3183408722369207, "frac_reward_zero_std": 1.0, "grad_norm": 1.8674772738217624, "kl": 4.8828125, "learning_rate": 1.7238586145289328e-05, "loss": 0.1954, "num_tokens": 861566836.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 60.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 41.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 28.0, "completions/min_terminated_length": 0.0, "epoch": 0.3185115643936161, "frac_reward_zero_std": 1.0, "grad_norm": 138.16131639330573, "kl": 18.34375, "learning_rate": 1.7234474228112947e-05, "loss": 0.7332, "num_tokens": 861620564.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 98.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 56.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 44.0, "completions/min_terminated_length": 0.0, "epoch": 0.3186822565503115, "frac_reward_zero_std": 1.0, "grad_norm": 8.061854151014812, "kl": 4.41796875, "learning_rate": 1.7230359742959362e-05, "loss": 0.1769, "num_tokens": 861677108.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 20.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 15.0, "completions/min_terminated_length": 0.0, "epoch": 0.3188529487070069, "frac_reward_zero_std": 1.0, "grad_norm": 3.8348568028565007, "kl": 3.73046875, "learning_rate": 1.722624269128907e-05, "loss": 0.1493, "num_tokens": 861722132.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 22.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 9.0, "completions/min_terminated_length": 0.0, "epoch": 0.31902364086370233, "frac_reward_zero_std": 1.0, "grad_norm": 33.769747736070265, "kl": 7.0078125, "learning_rate": 1.7222123074563475e-05, "loss": 0.2806, "num_tokens": 861768372.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 22.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 16.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 13.0, "completions/min_terminated_length": 0.0, "epoch": 0.31919433302039774, "frac_reward_zero_std": 1.0, "grad_norm": 3.124398068697236, "kl": 3.48828125, "learning_rate": 1.7218000894244894e-05, "loss": 0.1394, "num_tokens": 861840740.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 33.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 22.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 17.0, "completions/min_terminated_length": 0.0, "epoch": 0.3193650251770931, "frac_reward_zero_std": 1.0, "grad_norm": 1.6869719529009222, "kl": 3.18359375, "learning_rate": 1.721387615179655e-05, "loss": 0.1273, "num_tokens": 861886724.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 48.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 24.0, "completions/min_terminated_length": 0.0, "epoch": 0.3195357173337885, "frac_reward_zero_std": 1.0, "grad_norm": 2.6929045065423924, "kl": 3.828125, "learning_rate": 1.7209748848682575e-05, "loss": 0.1534, "num_tokens": 861944308.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 118.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 87.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 47.0, "completions/min_terminated_length": 0.0, "epoch": 0.3197064094904839, "frac_reward_zero_std": 1.0, "grad_norm": 5.657632388871566, "kl": 4.20703125, "learning_rate": 1.7205618986368016e-05, "loss": 0.1682, "num_tokens": 862008132.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 303.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 175.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 107.0, "completions/min_terminated_length": 0.0, "epoch": 0.3198771016471793, "frac_reward_zero_std": 1.0, "grad_norm": 1.9983041014290568, "kl": 3.76171875, "learning_rate": 1.720148656631883e-05, "loss": 0.1505, "num_tokens": 862095588.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 253.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 196.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 129.0, "completions/min_terminated_length": 0.0, "epoch": 0.3200477938038747, "frac_reward_zero_std": 1.0, "grad_norm": 4.594420466823394, "kl": 4.078125, "learning_rate": 1.7197351590001865e-05, "loss": 0.1631, "num_tokens": 862188020.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 400.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 302.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 205.0, "completions/min_terminated_length": 0.0, "epoch": 0.3202184859605701, "frac_reward_zero_std": 1.0, "grad_norm": 0.8452073731680785, "kl": 3.671875, "learning_rate": 1.71932140588849e-05, "loss": 0.1469, "num_tokens": 862303540.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 235.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 172.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 134.0, "completions/min_terminated_length": 0.0, "epoch": 0.3203891781172655, "frac_reward_zero_std": 1.0, "grad_norm": 2.7163741217561053, "kl": 4.25, "learning_rate": 1.71890739744366e-05, "loss": 0.1702, "num_tokens": 862390020.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 112.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 87.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 55.0, "completions/min_terminated_length": 0.0, "epoch": 0.32055987027396093, "frac_reward_zero_std": 1.0, "grad_norm": 2.2471241480971114, "kl": 4.421875, "learning_rate": 1.7184931338126554e-05, "loss": 0.1767, "num_tokens": 862450196.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 115.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 86.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 55.0, "completions/min_terminated_length": 0.0, "epoch": 0.3207305624306563, "frac_reward_zero_std": 1.0, "grad_norm": 61.96553417985389, "kl": 11.375, "learning_rate": 1.718078615142524e-05, "loss": 0.4551, "num_tokens": 862508852.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 147.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 98.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 76.0, "completions/min_terminated_length": 0.0, "epoch": 0.3209012545873517, "frac_reward_zero_std": 1.0, "grad_norm": 51.31092160541272, "kl": 9.7578125, "learning_rate": 1.717663841580406e-05, "loss": 0.3905, "num_tokens": 862577140.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 184.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 78.0, "completions/min_terminated_length": 0.0, "epoch": 0.3210719467440471, "frac_reward_zero_std": 1.0, "grad_norm": 4.093630451251194, "kl": 3.3515625, "learning_rate": 1.71724881327353e-05, "loss": 0.1341, "num_tokens": 862650436.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 176.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 127.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 96.0, "completions/min_terminated_length": 0.0, "epoch": 0.3212426389007425, "frac_reward_zero_std": 1.0, "grad_norm": 2.79835844758268, "kl": 4.4453125, "learning_rate": 1.7168335303692163e-05, "loss": 0.1779, "num_tokens": 862723668.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 166.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 105.0, "completions/min_terminated_length": 0.0, "epoch": 0.3214133310574379, "frac_reward_zero_std": 1.0, "grad_norm": 2.615125887076458, "kl": 4.7421875, "learning_rate": 1.7164179930148757e-05, "loss": 0.1898, "num_tokens": 862814420.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 265.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 229.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 204.0, "completions/min_terminated_length": 0.0, "epoch": 0.3215840232141333, "frac_reward_zero_std": 1.0, "grad_norm": 2.4284143779065888, "kl": 4.2109375, "learning_rate": 1.7160022013580088e-05, "loss": 0.1685, "num_tokens": 862930164.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 847.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 545.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 385.0, "completions/min_terminated_length": 0.0, "epoch": 0.3217547153708287, "frac_reward_zero_std": 1.0, "grad_norm": 0.5982949055114039, "kl": 2.66015625, "learning_rate": 1.7155861555462068e-05, "loss": 0.1063, "num_tokens": 863110020.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1622.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1227.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 999.0, "completions/min_terminated_length": 0.0, "epoch": 0.32192540752752413, "frac_reward_zero_std": 1.0, "grad_norm": 0.3538718040752636, "kl": 1.771484375, "learning_rate": 1.7151698557271506e-05, "loss": 0.071, "num_tokens": 863464692.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2019.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 1914.0, "completions/min_terminated_length": 0.0, "epoch": 0.3220960996842195, "frac_reward_zero_std": 1.0, "grad_norm": 16.80394554641877, "kl": 4.64453125, "learning_rate": 1.7147533020486117e-05, "loss": 0.1857, "num_tokens": 864023156.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3222667918409149, "frac_reward_zero_std": 1.0, "grad_norm": 27.058622313641933, "kl": 6.9453125, "learning_rate": 1.7143364946584517e-05, "loss": 0.2777, "num_tokens": 864587956.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3224374839976103, "frac_reward_zero_std": 1.0, "grad_norm": 6.240108729593254, "kl": 2.6875, "learning_rate": 1.7139194337046225e-05, "loss": 0.1076, "num_tokens": 865153284.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3226081761543057, "frac_reward_zero_std": 1.0, "grad_norm": 0.5978415656397382, "kl": 1.822265625, "learning_rate": 1.7135021193351647e-05, "loss": 0.0729, "num_tokens": 865717092.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3227788683110011, "frac_reward_zero_std": 1.0, "grad_norm": 1.2254320522127753, "kl": 1.7265625, "learning_rate": 1.713084551698211e-05, "loss": 0.0691, "num_tokens": 866297108.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2035.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 1951.0, "completions/min_terminated_length": 0.0, "epoch": 0.3229495604676965, "frac_reward_zero_std": 1.0, "grad_norm": 1.4729867442825912, "kl": 1.775390625, "learning_rate": 1.712666730941982e-05, "loss": 0.0711, "num_tokens": 866856772.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3231202526243919, "frac_reward_zero_std": 1.0, "grad_norm": 1.2855009043027592, "kl": 1.6640625, "learning_rate": 1.7122486572147895e-05, "loss": 0.0665, "num_tokens": 867421156.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32329094478108733, "frac_reward_zero_std": 1.0, "grad_norm": 0.5553965398281089, "kl": 1.478515625, "learning_rate": 1.7118303306650342e-05, "loss": 0.0592, "num_tokens": 867997220.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3234616369377827, "frac_reward_zero_std": 1.0, "grad_norm": 2.018348831595327, "kl": 1.6953125, "learning_rate": 1.7114117514412073e-05, "loss": 0.0679, "num_tokens": 868558740.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3236323290944781, "frac_reward_zero_std": 1.0, "grad_norm": 2.0625017440821365, "kl": 1.6171875, "learning_rate": 1.710992919691889e-05, "loss": 0.0647, "num_tokens": 869120644.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3238030212511735, "frac_reward_zero_std": 1.0, "grad_norm": 1.8309818322241804, "kl": 1.5234375, "learning_rate": 1.7105738355657498e-05, "loss": 0.0609, "num_tokens": 869687828.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3239737134078689, "frac_reward_zero_std": 1.0, "grad_norm": 1.1436749504295878, "kl": 1.05859375, "learning_rate": 1.7101544992115487e-05, "loss": 0.0424, "num_tokens": 870251220.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3241444055645643, "frac_reward_zero_std": 1.0, "grad_norm": 0.2663766341014166, "kl": 0.47119140625, "learning_rate": 1.7097349107781362e-05, "loss": 0.0189, "num_tokens": 870827300.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3243150977212597, "frac_reward_zero_std": 1.0, "grad_norm": 0.39695727966537964, "kl": 0.21630859375, "learning_rate": 1.70931507041445e-05, "loss": 0.0087, "num_tokens": 871392484.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3244857898779551, "frac_reward_zero_std": 1.0, "grad_norm": 0.29783187279342616, "kl": 0.1329345703125, "learning_rate": 1.7088949782695187e-05, "loss": 0.0053, "num_tokens": 871960708.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32465648203465053, "frac_reward_zero_std": 1.0, "grad_norm": 0.13631199798011942, "kl": 0.05645751953125, "learning_rate": 1.7084746344924595e-05, "loss": 0.0023, "num_tokens": 872530612.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32482717419134594, "frac_reward_zero_std": 1.0, "grad_norm": 0.9298509180604888, "kl": 0.103271484375, "learning_rate": 1.7080540392324797e-05, "loss": 0.0041, "num_tokens": 873098308.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3249978663480413, "frac_reward_zero_std": 1.0, "grad_norm": 0.038293794310866, "kl": 0.0447998046875, "learning_rate": 1.7076331926388753e-05, "loss": 0.0018, "num_tokens": 873664356.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3251685585047367, "frac_reward_zero_std": 1.0, "grad_norm": 0.05082697147245775, "kl": 0.049560546875, "learning_rate": 1.7072120948610314e-05, "loss": 0.002, "num_tokens": 874231700.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3253392506614321, "frac_reward_zero_std": 1.0, "grad_norm": 0.03975649335026855, "kl": 0.05078125, "learning_rate": 1.706790746048423e-05, "loss": 0.002, "num_tokens": 874798404.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3255099428181275, "frac_reward_zero_std": 1.0, "grad_norm": 0.03782397559862129, "kl": 0.05224609375, "learning_rate": 1.706369146350613e-05, "loss": 0.0021, "num_tokens": 875362100.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3256806349748229, "frac_reward_zero_std": 1.0, "grad_norm": 0.04779528678612856, "kl": 0.057373046875, "learning_rate": 1.705947295917255e-05, "loss": 0.0023, "num_tokens": 875926532.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3258513271315183, "frac_reward_zero_std": 1.0, "grad_norm": 0.022664811761328094, "kl": 0.054443359375, "learning_rate": 1.70552519489809e-05, "loss": 0.0022, "num_tokens": 876490340.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3260220192882137, "frac_reward_zero_std": 1.0, "grad_norm": 0.02138911158869845, "kl": 0.048828125, "learning_rate": 1.7051028434429487e-05, "loss": 0.002, "num_tokens": 877053636.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32619271144490913, "frac_reward_zero_std": 1.0, "grad_norm": 0.04816319325207368, "kl": 0.05438232421875, "learning_rate": 1.7046802417017508e-05, "loss": 0.0022, "num_tokens": 877617748.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3263634036016045, "frac_reward_zero_std": 1.0, "grad_norm": 0.01833461103562442, "kl": 0.048583984375, "learning_rate": 1.7042573898245045e-05, "loss": 0.0019, "num_tokens": 878181220.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3265340957582999, "frac_reward_zero_std": 1.0, "grad_norm": 0.013521505082792432, "kl": 0.04730224609375, "learning_rate": 1.703834287961307e-05, "loss": 0.0019, "num_tokens": 878750372.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3267047879149953, "frac_reward_zero_std": 1.0, "grad_norm": 0.021936825669549822, "kl": 0.048583984375, "learning_rate": 1.7034109362623446e-05, "loss": 0.0019, "num_tokens": 879312644.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3268754800716907, "frac_reward_zero_std": 1.0, "grad_norm": 0.02263487503787584, "kl": 0.04571533203125, "learning_rate": 1.7029873348778915e-05, "loss": 0.0018, "num_tokens": 879877012.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3270461722283861, "frac_reward_zero_std": 1.0, "grad_norm": 0.021752757483802843, "kl": 0.044921875, "learning_rate": 1.7025634839583104e-05, "loss": 0.0018, "num_tokens": 880442580.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3272168643850815, "frac_reward_zero_std": 1.0, "grad_norm": 0.018328848479256303, "kl": 0.05950927734375, "learning_rate": 1.7021393836540537e-05, "loss": 0.0024, "num_tokens": 881008084.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3273875565417769, "frac_reward_zero_std": 1.0, "grad_norm": 0.015261299695325165, "kl": 0.0440673828125, "learning_rate": 1.701715034115662e-05, "loss": 0.0018, "num_tokens": 881569444.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32755824869847233, "frac_reward_zero_std": 1.0, "grad_norm": 0.012609922161534495, "kl": 0.04205322265625, "learning_rate": 1.7012904354937636e-05, "loss": 0.0017, "num_tokens": 882135124.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3277289408551677, "frac_reward_zero_std": 1.0, "grad_norm": 0.005973690545093223, "kl": 0.0391845703125, "learning_rate": 1.7008655879390753e-05, "loss": 0.0016, "num_tokens": 882700756.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3278996330118631, "frac_reward_zero_std": 1.0, "grad_norm": 0.013077104041081677, "kl": 0.040771484375, "learning_rate": 1.7004404916024032e-05, "loss": 0.0016, "num_tokens": 883270356.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3280703251685585, "frac_reward_zero_std": 1.0, "grad_norm": 0.017474248188183186, "kl": 0.0413818359375, "learning_rate": 1.700015146634641e-05, "loss": 0.0017, "num_tokens": 883833572.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3282410173252539, "frac_reward_zero_std": 1.0, "grad_norm": 0.009402198475573886, "kl": 0.04052734375, "learning_rate": 1.6995895531867713e-05, "loss": 0.0016, "num_tokens": 884399268.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3284117094819493, "frac_reward_zero_std": 1.0, "grad_norm": 0.007016946636151437, "kl": 0.0445556640625, "learning_rate": 1.6991637114098634e-05, "loss": 0.0018, "num_tokens": 884970052.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3285824016386447, "frac_reward_zero_std": 1.0, "grad_norm": 0.010862866069717862, "kl": 0.04742431640625, "learning_rate": 1.698737621455077e-05, "loss": 0.0019, "num_tokens": 885541380.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3287530937953401, "frac_reward_zero_std": 1.0, "grad_norm": 0.011876454014911128, "kl": 0.05487060546875, "learning_rate": 1.698311283473657e-05, "loss": 0.0022, "num_tokens": 886106836.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32892378595203553, "frac_reward_zero_std": 1.0, "grad_norm": 0.013960794253774302, "kl": 0.0618896484375, "learning_rate": 1.6978846976169395e-05, "loss": 0.0025, "num_tokens": 886672420.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3290944781087309, "frac_reward_zero_std": 1.0, "grad_norm": 0.026157855463917707, "kl": 0.0731201171875, "learning_rate": 1.6974578640363466e-05, "loss": 0.0029, "num_tokens": 887238420.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3292651702654263, "frac_reward_zero_std": 1.0, "grad_norm": 0.0265138327870391, "kl": 0.07421875, "learning_rate": 1.6970307828833887e-05, "loss": 0.003, "num_tokens": 887807876.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3294358624221217, "frac_reward_zero_std": 1.0, "grad_norm": 0.024315332944880984, "kl": 0.07763671875, "learning_rate": 1.696603454309664e-05, "loss": 0.0031, "num_tokens": 888371044.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3296065545788171, "frac_reward_zero_std": 1.0, "grad_norm": 0.011308871853546884, "kl": 0.0634765625, "learning_rate": 1.696175878466859e-05, "loss": 0.0025, "num_tokens": 888948244.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3297772467355125, "frac_reward_zero_std": 1.0, "grad_norm": 0.011680292978822785, "kl": 0.0775146484375, "learning_rate": 1.695748055506748e-05, "loss": 0.0031, "num_tokens": 889515572.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3299479388922079, "frac_reward_zero_std": 1.0, "grad_norm": 0.012866752401468876, "kl": 0.072021484375, "learning_rate": 1.6953199855811925e-05, "loss": 0.0029, "num_tokens": 890080596.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3301186310489033, "frac_reward_zero_std": 1.0, "grad_norm": 0.009948797245723147, "kl": 0.069091796875, "learning_rate": 1.694891668842141e-05, "loss": 0.0028, "num_tokens": 890647812.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.33028932320559873, "frac_reward_zero_std": 1.0, "grad_norm": 0.012318740175197376, "kl": 0.0755615234375, "learning_rate": 1.694463105441632e-05, "loss": 0.003, "num_tokens": 891210948.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3304600153622941, "frac_reward_zero_std": 1.0, "grad_norm": 0.012937297564486996, "kl": 0.083251953125, "learning_rate": 1.6940342955317882e-05, "loss": 0.0033, "num_tokens": 891776164.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3306307075189895, "frac_reward_zero_std": 1.0, "grad_norm": 0.015525340828750026, "kl": 0.08984375, "learning_rate": 1.6936052392648236e-05, "loss": 0.0036, "num_tokens": 892342788.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3308013996756849, "frac_reward_zero_std": 1.0, "grad_norm": 0.027838893909905887, "kl": 0.0965576171875, "learning_rate": 1.6931759367930364e-05, "loss": 0.0039, "num_tokens": 892905364.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3309720918323803, "frac_reward_zero_std": 1.0, "grad_norm": 0.014878851304318153, "kl": 0.0775146484375, "learning_rate": 1.692746388268814e-05, "loss": 0.0031, "num_tokens": 893477908.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3311427839890757, "frac_reward_zero_std": 1.0, "grad_norm": 0.018570905665251915, "kl": 0.0823974609375, "learning_rate": 1.6923165938446303e-05, "loss": 0.0033, "num_tokens": 894040916.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3313134761457711, "frac_reward_zero_std": 1.0, "grad_norm": 0.007550827013743831, "kl": 0.0806884765625, "learning_rate": 1.6918865536730473e-05, "loss": 0.0032, "num_tokens": 894602596.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3314841683024665, "frac_reward_zero_std": 1.0, "grad_norm": 0.010394174723266673, "kl": 0.06329345703125, "learning_rate": 1.6914562679067132e-05, "loss": 0.0025, "num_tokens": 895192900.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3316548604591619, "frac_reward_zero_std": 1.0, "grad_norm": 0.007174314784339897, "kl": 0.0760498046875, "learning_rate": 1.691025736698364e-05, "loss": 0.003, "num_tokens": 895760212.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3318255526158573, "frac_reward_zero_std": 1.0, "grad_norm": 0.02172519728360886, "kl": 0.0906982421875, "learning_rate": 1.6905949602008228e-05, "loss": 0.0036, "num_tokens": 896322164.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3319962447725527, "frac_reward_zero_std": 1.0, "grad_norm": 0.010659320057154153, "kl": 0.07373046875, "learning_rate": 1.6901639385670002e-05, "loss": 0.003, "num_tokens": 896888644.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3321669369292481, "frac_reward_zero_std": 1.0, "grad_norm": 0.023418658681092518, "kl": 0.1004638671875, "learning_rate": 1.6897326719498926e-05, "loss": 0.004, "num_tokens": 897454612.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3323376290859435, "frac_reward_zero_std": 1.0, "grad_norm": 0.017112873238113287, "kl": 0.06268310546875, "learning_rate": 1.6893011605025845e-05, "loss": 0.0025, "num_tokens": 898023956.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3325083212426389, "frac_reward_zero_std": 1.0, "grad_norm": 0.02085663317197227, "kl": 0.065673828125, "learning_rate": 1.688869404378247e-05, "loss": 0.0026, "num_tokens": 898592452.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3326790133993343, "frac_reward_zero_std": 1.0, "grad_norm": 0.015585104907682877, "kl": 0.0684814453125, "learning_rate": 1.6884374037301375e-05, "loss": 0.0027, "num_tokens": 899156628.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3328497055560297, "frac_reward_zero_std": 1.0, "grad_norm": 0.01346767239983962, "kl": 0.0655517578125, "learning_rate": 1.6880051587116006e-05, "loss": 0.0026, "num_tokens": 899720244.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3330203977127251, "frac_reward_zero_std": 1.0, "grad_norm": 0.012464685342624855, "kl": 0.0645751953125, "learning_rate": 1.6875726694760682e-05, "loss": 0.0026, "num_tokens": 900293460.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3331910898694205, "frac_reward_zero_std": 1.0, "grad_norm": 0.020048444731511857, "kl": 0.0692138671875, "learning_rate": 1.687139936177058e-05, "loss": 0.0028, "num_tokens": 900854916.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3333617820261159, "frac_reward_zero_std": 1.0, "grad_norm": 0.014427162311646476, "kl": 0.077392578125, "learning_rate": 1.6867069589681748e-05, "loss": 0.0031, "num_tokens": 901417764.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3335324741828113, "frac_reward_zero_std": 1.0, "grad_norm": 0.009863650809047948, "kl": 0.07647705078125, "learning_rate": 1.6862737380031095e-05, "loss": 0.0031, "num_tokens": 901981332.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3337031663395067, "frac_reward_zero_std": 1.0, "grad_norm": 0.015052236616862613, "kl": 0.05645751953125, "learning_rate": 1.6858402734356406e-05, "loss": 0.0023, "num_tokens": 902549476.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3338738584962021, "frac_reward_zero_std": 1.0, "grad_norm": 0.015408237460178852, "kl": 0.05657958984375, "learning_rate": 1.685406565419632e-05, "loss": 0.0023, "num_tokens": 903115684.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3340445506528975, "frac_reward_zero_std": 1.0, "grad_norm": 0.011251988342075098, "kl": 0.067138671875, "learning_rate": 1.6849726141090346e-05, "loss": 0.0027, "num_tokens": 903678340.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3342152428095929, "frac_reward_zero_std": 1.0, "grad_norm": 0.007147738231219778, "kl": 0.0682373046875, "learning_rate": 1.684538419657885e-05, "loss": 0.0027, "num_tokens": 904240964.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3343859349662883, "frac_reward_zero_std": 1.0, "grad_norm": 0.010788663341923467, "kl": 0.0643310546875, "learning_rate": 1.684103982220307e-05, "loss": 0.0026, "num_tokens": 904804884.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3345566271229837, "frac_reward_zero_std": 1.0, "grad_norm": 0.02118531508837711, "kl": 0.07073974609375, "learning_rate": 1.68366930195051e-05, "loss": 0.0028, "num_tokens": 905367028.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3347273192796791, "frac_reward_zero_std": 1.0, "grad_norm": 0.009138709225061764, "kl": 0.0791015625, "learning_rate": 1.68323437900279e-05, "loss": 0.0032, "num_tokens": 905932340.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3348980114363745, "frac_reward_zero_std": 1.0, "grad_norm": 0.0193502656564982, "kl": 0.05511474609375, "learning_rate": 1.682799213531529e-05, "loss": 0.0022, "num_tokens": 906495316.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3350687035930699, "frac_reward_zero_std": 1.0, "grad_norm": 0.011841074533773048, "kl": 0.0584716796875, "learning_rate": 1.6823638056911944e-05, "loss": 0.0023, "num_tokens": 907057220.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3352393957497653, "frac_reward_zero_std": 1.0, "grad_norm": 0.01086952105554306, "kl": 0.0584716796875, "learning_rate": 1.6819281556363405e-05, "loss": 0.0023, "num_tokens": 907627268.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3354100879064607, "frac_reward_zero_std": 1.0, "grad_norm": 0.004780171785186772, "kl": 0.05810546875, "learning_rate": 1.681492263521608e-05, "loss": 0.0023, "num_tokens": 908197156.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3355807800631561, "frac_reward_zero_std": 1.0, "grad_norm": 0.03687340743925772, "kl": 0.0802001953125, "learning_rate": 1.6810561295017223e-05, "loss": 0.0032, "num_tokens": 908768100.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3357514722198515, "frac_reward_zero_std": 1.0, "grad_norm": 0.013341677667605589, "kl": 0.06292724609375, "learning_rate": 1.680619753731495e-05, "loss": 0.0025, "num_tokens": 909332564.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3359221643765469, "frac_reward_zero_std": 1.0, "grad_norm": 0.013856949049526389, "kl": 0.08477783203125, "learning_rate": 1.6801831363658245e-05, "loss": 0.0034, "num_tokens": 909899268.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3360928565332423, "frac_reward_zero_std": 1.0, "grad_norm": 0.011419370034841654, "kl": 0.05963134765625, "learning_rate": 1.679746277559694e-05, "loss": 0.0024, "num_tokens": 910472580.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3362635486899377, "frac_reward_zero_std": 1.0, "grad_norm": 0.007598965825379358, "kl": 0.05560302734375, "learning_rate": 1.679309177468172e-05, "loss": 0.0022, "num_tokens": 911042308.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3364342408466331, "frac_reward_zero_std": 1.0, "grad_norm": 0.01123242482140351, "kl": 0.0615234375, "learning_rate": 1.6788718362464134e-05, "loss": 0.0025, "num_tokens": 911608580.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3366049330033285, "frac_reward_zero_std": 1.0, "grad_norm": 0.007322550576926078, "kl": 0.06097412109375, "learning_rate": 1.6784342540496595e-05, "loss": 0.0024, "num_tokens": 912177732.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3367756251600239, "frac_reward_zero_std": 1.0, "grad_norm": 0.014315408404117907, "kl": 0.0760498046875, "learning_rate": 1.677996431033235e-05, "loss": 0.003, "num_tokens": 912742404.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3369463173167193, "frac_reward_zero_std": 1.0, "grad_norm": 0.009436301651221784, "kl": 0.07080078125, "learning_rate": 1.6775583673525518e-05, "loss": 0.0028, "num_tokens": 913311364.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3371170094734147, "frac_reward_zero_std": 1.0, "grad_norm": 0.07103595385993826, "kl": 0.078857421875, "learning_rate": 1.677120063163107e-05, "loss": 0.0032, "num_tokens": 913875316.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.33728770163011007, "frac_reward_zero_std": 1.0, "grad_norm": 0.017906749682455713, "kl": 0.088623046875, "learning_rate": 1.6766815186204816e-05, "loss": 0.0035, "num_tokens": 914442372.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3374583937868055, "frac_reward_zero_std": 1.0, "grad_norm": 0.04565958447405423, "kl": 0.113037109375, "learning_rate": 1.6762427338803446e-05, "loss": 0.0045, "num_tokens": 915008196.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3376290859435009, "frac_reward_zero_std": 1.0, "grad_norm": 0.8450510647556154, "kl": 0.232177734375, "learning_rate": 1.6758037090984477e-05, "loss": 0.0093, "num_tokens": 915578100.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3377997781001963, "frac_reward_zero_std": 1.0, "grad_norm": 0.36532858884366376, "kl": 0.6220703125, "learning_rate": 1.6753644444306293e-05, "loss": 0.0249, "num_tokens": 916144260.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3379704702568917, "frac_reward_zero_std": 1.0, "grad_norm": 0.4031272820841342, "kl": 0.9921875, "learning_rate": 1.6749249400328126e-05, "loss": 0.0397, "num_tokens": 916714500.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3381411624135871, "frac_reward_zero_std": 1.0, "grad_norm": 0.26123037721207115, "kl": 0.5087890625, "learning_rate": 1.6744851960610056e-05, "loss": 0.0203, "num_tokens": 917283716.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3383118545702825, "frac_reward_zero_std": 1.0, "grad_norm": 0.20144976233901754, "kl": 0.379638671875, "learning_rate": 1.674045212671301e-05, "loss": 0.0152, "num_tokens": 917851412.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3384825467269779, "frac_reward_zero_std": 1.0, "grad_norm": 0.14534811161610736, "kl": 0.242431640625, "learning_rate": 1.6736049900198777e-05, "loss": 0.0097, "num_tokens": 918418596.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.33865323888367327, "frac_reward_zero_std": 1.0, "grad_norm": 0.09715710749437803, "kl": 0.153076171875, "learning_rate": 1.6731645282629992e-05, "loss": 0.0061, "num_tokens": 918987924.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3388239310403687, "frac_reward_zero_std": 1.0, "grad_norm": 0.0731017651849606, "kl": 0.1187744140625, "learning_rate": 1.6727238275570123e-05, "loss": 0.0047, "num_tokens": 919554036.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3389946231970641, "frac_reward_zero_std": 1.0, "grad_norm": 0.11121518040818167, "kl": 0.1405029296875, "learning_rate": 1.672282888058351e-05, "loss": 0.0056, "num_tokens": 920120020.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3391653153537595, "frac_reward_zero_std": 1.0, "grad_norm": 0.08308575847305164, "kl": 0.1163330078125, "learning_rate": 1.6718417099235323e-05, "loss": 0.0047, "num_tokens": 920686964.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3393360075104549, "frac_reward_zero_std": 1.0, "grad_norm": 0.07200430881415232, "kl": 0.1204833984375, "learning_rate": 1.6714002933091588e-05, "loss": 0.0048, "num_tokens": 921250772.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3395066996671503, "frac_reward_zero_std": 1.0, "grad_norm": 0.03749096046793016, "kl": 0.0831298828125, "learning_rate": 1.6709586383719175e-05, "loss": 0.0033, "num_tokens": 921813956.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3396773918238457, "frac_reward_zero_std": 1.0, "grad_norm": 0.06379204324569757, "kl": 0.079345703125, "learning_rate": 1.670516745268579e-05, "loss": 0.0032, "num_tokens": 922381796.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3398480839805411, "frac_reward_zero_std": 1.0, "grad_norm": 0.03154125207619222, "kl": 0.1075439453125, "learning_rate": 1.670074614156001e-05, "loss": 0.0043, "num_tokens": 922945028.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34001877613723647, "frac_reward_zero_std": 1.0, "grad_norm": 0.04967152173478134, "kl": 0.12158203125, "learning_rate": 1.6696322451911232e-05, "loss": 0.0049, "num_tokens": 923510868.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3401894682939319, "frac_reward_zero_std": 1.0, "grad_norm": 0.4635070353166631, "kl": 0.225830078125, "learning_rate": 1.66918963853097e-05, "loss": 0.009, "num_tokens": 924079812.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3403601604506273, "frac_reward_zero_std": 1.0, "grad_norm": 0.11834335753650957, "kl": 0.208251953125, "learning_rate": 1.6687467943326525e-05, "loss": 0.0083, "num_tokens": 924643796.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3405308526073227, "frac_reward_zero_std": 1.0, "grad_norm": 0.4056326493464838, "kl": 0.245361328125, "learning_rate": 1.6683037127533625e-05, "loss": 0.0098, "num_tokens": 925210692.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3407015447640181, "frac_reward_zero_std": 1.0, "grad_norm": 0.1233633344327869, "kl": 0.25927734375, "learning_rate": 1.6678603939503796e-05, "loss": 0.0104, "num_tokens": 925777988.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3408722369207135, "frac_reward_zero_std": 1.0, "grad_norm": 0.05037984915795418, "kl": 0.24267578125, "learning_rate": 1.6674168380810647e-05, "loss": 0.0097, "num_tokens": 926339236.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3410429290774089, "frac_reward_zero_std": 1.0, "grad_norm": 0.18561982217050468, "kl": 0.318359375, "learning_rate": 1.6669730453028646e-05, "loss": 0.0127, "num_tokens": 926904148.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3412136212341043, "frac_reward_zero_std": 1.0, "grad_norm": 0.163329175262394, "kl": 0.26318359375, "learning_rate": 1.66652901577331e-05, "loss": 0.0105, "num_tokens": 927466676.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34138431339079967, "frac_reward_zero_std": 1.0, "grad_norm": 0.1818649359839612, "kl": 0.2001953125, "learning_rate": 1.6660847496500153e-05, "loss": 0.008, "num_tokens": 928037300.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34155500554749507, "frac_reward_zero_std": 1.0, "grad_norm": 0.17980720069932263, "kl": 0.216064453125, "learning_rate": 1.6656402470906785e-05, "loss": 0.0086, "num_tokens": 928603796.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3417256977041905, "frac_reward_zero_std": 1.0, "grad_norm": 0.17023919408072577, "kl": 0.201904296875, "learning_rate": 1.665195508253082e-05, "loss": 0.0081, "num_tokens": 929178420.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3418963898608859, "frac_reward_zero_std": 1.0, "grad_norm": 0.1580450548359176, "kl": 0.2513427734375, "learning_rate": 1.6647505332950925e-05, "loss": 0.0101, "num_tokens": 929748820.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3420670820175813, "frac_reward_zero_std": 1.0, "grad_norm": 0.18528817517982002, "kl": 0.241943359375, "learning_rate": 1.66430532237466e-05, "loss": 0.0097, "num_tokens": 930314548.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3422377741742767, "frac_reward_zero_std": 1.0, "grad_norm": 1.1162582126198928, "kl": 0.3800048828125, "learning_rate": 1.6638598756498178e-05, "loss": 0.0152, "num_tokens": 930889828.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3424084663309721, "frac_reward_zero_std": 1.0, "grad_norm": 0.05447406517235757, "kl": 0.4375, "learning_rate": 1.6634141932786837e-05, "loss": 0.0175, "num_tokens": 931450724.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3425791584876675, "frac_reward_zero_std": 1.0, "grad_norm": 0.1924327525935145, "kl": 0.6337890625, "learning_rate": 1.662968275419459e-05, "loss": 0.0254, "num_tokens": 932014132.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3427498506443629, "frac_reward_zero_std": 1.0, "grad_norm": 0.23613702575018072, "kl": 0.9306640625, "learning_rate": 1.6625221222304284e-05, "loss": 0.0373, "num_tokens": 932581780.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34292054280105827, "frac_reward_zero_std": 1.0, "grad_norm": 1.615392420901322, "kl": 1.31640625, "learning_rate": 1.66207573386996e-05, "loss": 0.0527, "num_tokens": 933143652.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1923.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 1642.0, "completions/min_terminated_length": 0.0, "epoch": 0.3430912349577537, "frac_reward_zero_std": 1.0, "grad_norm": 0.6482275956926529, "kl": 1.2421875, "learning_rate": 1.6616291104965056e-05, "loss": 0.0497, "num_tokens": 933684212.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2044.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2016.0, "completions/min_terminated_length": 0.0, "epoch": 0.3432619271144491, "frac_reward_zero_std": 1.0, "grad_norm": 1.5182138214692718, "kl": 1.193359375, "learning_rate": 1.6611822522686e-05, "loss": 0.0477, "num_tokens": 934247700.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3434326192711445, "frac_reward_zero_std": 1.0, "grad_norm": 2.6834012610111864, "kl": 1.3984375, "learning_rate": 1.6607351593448626e-05, "loss": 0.056, "num_tokens": 934808660.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3436033114278399, "frac_reward_zero_std": 1.0, "grad_norm": 0.9612992156600203, "kl": 1.16015625, "learning_rate": 1.660287831883995e-05, "loss": 0.0464, "num_tokens": 935375092.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2008.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1797.0, "completions/min_terminated_length": 0.0, "epoch": 0.3437740035845353, "frac_reward_zero_std": 1.0, "grad_norm": 1.3209917102606044, "kl": 1.27734375, "learning_rate": 1.6598402700447816e-05, "loss": 0.0511, "num_tokens": 935926148.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3439446957412307, "frac_reward_zero_std": 1.0, "grad_norm": 1.1073574399382842, "kl": 1.0390625, "learning_rate": 1.6593924739860917e-05, "loss": 0.0416, "num_tokens": 936490964.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3441153878979261, "frac_reward_zero_std": 1.0, "grad_norm": 0.305322762584208, "kl": 0.7578125, "learning_rate": 1.658944443866876e-05, "loss": 0.0303, "num_tokens": 937059492.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34428608005462147, "frac_reward_zero_std": 1.0, "grad_norm": 1.3641101877091308, "kl": 0.638671875, "learning_rate": 1.658496179846169e-05, "loss": 0.0255, "num_tokens": 937628228.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3444567722113169, "frac_reward_zero_std": 1.0, "grad_norm": 0.9025460909135268, "kl": 0.56201171875, "learning_rate": 1.658047682083089e-05, "loss": 0.0225, "num_tokens": 938200004.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3446274643680123, "frac_reward_zero_std": 1.0, "grad_norm": 0.9647865655419913, "kl": 0.822265625, "learning_rate": 1.6575989507368356e-05, "loss": 0.0329, "num_tokens": 938766692.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3447981565247077, "frac_reward_zero_std": 1.0, "grad_norm": 0.9783048949455035, "kl": 1.0263671875, "learning_rate": 1.6571499859666928e-05, "loss": 0.041, "num_tokens": 939330308.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3449688486814031, "frac_reward_zero_std": 1.0, "grad_norm": 1.878175319928716, "kl": 1.416015625, "learning_rate": 1.6567007879320266e-05, "loss": 0.0567, "num_tokens": 939896100.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3451395408380985, "frac_reward_zero_std": 1.0, "grad_norm": 1.6590849466682343, "kl": 1.3984375, "learning_rate": 1.6562513567922867e-05, "loss": 0.056, "num_tokens": 940459940.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3453102329947939, "frac_reward_zero_std": 1.0, "grad_norm": 1.289139317970155, "kl": 1.21875, "learning_rate": 1.6558016927070043e-05, "loss": 0.0488, "num_tokens": 941025044.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3454809251514893, "frac_reward_zero_std": 1.0, "grad_norm": 0.5141928795890918, "kl": 0.919921875, "learning_rate": 1.655351795835794e-05, "loss": 0.0368, "num_tokens": 941587636.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34565161730818467, "frac_reward_zero_std": 1.0, "grad_norm": 0.34297523730142354, "kl": 0.572265625, "learning_rate": 1.6549016663383527e-05, "loss": 0.0229, "num_tokens": 942163988.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3458223094648801, "frac_reward_zero_std": 1.0, "grad_norm": 0.2850603298783978, "kl": 0.4375, "learning_rate": 1.6544513043744606e-05, "loss": 0.0175, "num_tokens": 942726340.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3459930016215755, "frac_reward_zero_std": 1.0, "grad_norm": 0.6046263577788037, "kl": 0.4423828125, "learning_rate": 1.6540007101039796e-05, "loss": 0.0177, "num_tokens": 943283412.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3461636937782709, "frac_reward_zero_std": 1.0, "grad_norm": 0.5889839162466722, "kl": 0.47119140625, "learning_rate": 1.6535498836868548e-05, "loss": 0.0188, "num_tokens": 943850580.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3463343859349663, "frac_reward_zero_std": 1.0, "grad_norm": 684.1505762289214, "kl": 74.494140625, "learning_rate": 1.6530988252831134e-05, "loss": 2.9973, "num_tokens": 944412132.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3465050780916617, "frac_reward_zero_std": 1.0, "grad_norm": 5.794946656229915, "kl": 1.99609375, "learning_rate": 1.652647535052864e-05, "loss": 0.08, "num_tokens": 944981188.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3466757702483571, "frac_reward_zero_std": 1.0, "grad_norm": 6.213777916575745, "kl": 2.59765625, "learning_rate": 1.6521960131562993e-05, "loss": 0.104, "num_tokens": 945544036.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1959.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 1342.0, "completions/min_terminated_length": 0.0, "epoch": 0.3468464624050525, "frac_reward_zero_std": 1.0, "grad_norm": 3.1513553856548158, "kl": 2.125, "learning_rate": 1.6517442597536926e-05, "loss": 0.0851, "num_tokens": 946085972.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1312.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 976.0, "completions/min_terminated_length": 0.0, "epoch": 0.34701715456174786, "frac_reward_zero_std": 1.0, "grad_norm": 0.7819238958814486, "kl": 2.24609375, "learning_rate": 1.6512922750054003e-05, "loss": 0.0897, "num_tokens": 946458164.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 920.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 650.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 476.0, "completions/min_terminated_length": 0.0, "epoch": 0.34718784671844327, "frac_reward_zero_std": 1.0, "grad_norm": 1.877139668241737, "kl": 3.3984375, "learning_rate": 1.650840059071861e-05, "loss": 0.136, "num_tokens": 946679668.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 977.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 775.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 470.0, "completions/min_terminated_length": 0.0, "epoch": 0.3473585388751387, "frac_reward_zero_std": 1.0, "grad_norm": 1.962971171891792, "kl": 3.24609375, "learning_rate": 1.650387612113594e-05, "loss": 0.13, "num_tokens": 946920356.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 874.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 602.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 478.0, "completions/min_terminated_length": 0.0, "epoch": 0.3475292310318341, "frac_reward_zero_std": 1.0, "grad_norm": 1.885712218419248, "kl": 3.90234375, "learning_rate": 1.6499349342912033e-05, "loss": 0.156, "num_tokens": 947121428.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 868.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 720.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 499.0, "completions/min_terminated_length": 0.0, "epoch": 0.3476999231885295, "frac_reward_zero_std": 1.0, "grad_norm": 1.5170300757710953, "kl": 3.40625, "learning_rate": 1.6494820257653716e-05, "loss": 0.1361, "num_tokens": 947339220.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1350.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 889.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 560.0, "completions/min_terminated_length": 0.0, "epoch": 0.3478706153452249, "frac_reward_zero_std": 1.0, "grad_norm": 0.8785621699753657, "kl": 3.16796875, "learning_rate": 1.6490288866968658e-05, "loss": 0.1267, "num_tokens": 947610404.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1665.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1317.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 918.0, "completions/min_terminated_length": 0.0, "epoch": 0.3480413075019203, "frac_reward_zero_std": 1.0, "grad_norm": 1.6507692998932495, "kl": 2.65234375, "learning_rate": 1.6485755172465338e-05, "loss": 0.106, "num_tokens": 947989108.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1805.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1320.0, "completions/min_terminated_length": 0.0, "epoch": 0.3482119996586157, "frac_reward_zero_std": 1.0, "grad_norm": 2.9753482558800592, "kl": 2.515625, "learning_rate": 1.6481219175753054e-05, "loss": 0.1007, "num_tokens": 948490420.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1825.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1425.0, "completions/min_terminated_length": 0.0, "epoch": 0.34838269181531106, "frac_reward_zero_std": 1.0, "grad_norm": 2.6848703834767678, "kl": 2.42578125, "learning_rate": 1.6476680878441925e-05, "loss": 0.0971, "num_tokens": 948994004.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1619.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1152.0, "completions/min_terminated_length": 0.0, "epoch": 0.34855338397200647, "frac_reward_zero_std": 1.0, "grad_norm": 0.8960515373370652, "kl": 2.4609375, "learning_rate": 1.6472140282142874e-05, "loss": 0.0986, "num_tokens": 949446948.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1984.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1470.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1035.0, "completions/min_terminated_length": 0.0, "epoch": 0.3487240761287019, "frac_reward_zero_std": 1.0, "grad_norm": 0.10694943659294792, "kl": 2.38671875, "learning_rate": 1.6467597388467656e-05, "loss": 0.0953, "num_tokens": 949863460.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1032.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 798.0, "completions/min_terminated_length": 0.0, "epoch": 0.3488947682853973, "frac_reward_zero_std": 1.0, "grad_norm": 1.1511372835678628, "kl": 2.92578125, "learning_rate": 1.646305219902883e-05, "loss": 0.117, "num_tokens": 950166148.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1531.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1215.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 490.0, "completions/min_terminated_length": 0.0, "epoch": 0.3490654604420927, "frac_reward_zero_std": 1.0, "grad_norm": 0.9104327315128892, "kl": 2.72265625, "learning_rate": 1.645850471543978e-05, "loss": 0.1087, "num_tokens": 950514372.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1324.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1011.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 664.0, "completions/min_terminated_length": 0.0, "epoch": 0.3492361525987881, "frac_reward_zero_std": 1.0, "grad_norm": 0.5793442153304551, "kl": 2.765625, "learning_rate": 1.6453954939314693e-05, "loss": 0.1105, "num_tokens": 950817108.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1935.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1405.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 1073.0, "completions/min_terminated_length": 0.0, "epoch": 0.3494068447554835, "frac_reward_zero_std": 1.0, "grad_norm": 0.11710821319452411, "kl": 2.4140625, "learning_rate": 1.6449402872268574e-05, "loss": 0.0964, "num_tokens": 951212260.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1322.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 864.0, "completions/min_terminated_length": 0.0, "epoch": 0.3495775369121789, "frac_reward_zero_std": 1.0, "grad_norm": 0.9075041056579661, "kl": 2.51953125, "learning_rate": 1.6444848515917248e-05, "loss": 0.1007, "num_tokens": 951586516.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1962.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1528.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 1141.0, "completions/min_terminated_length": 0.0, "epoch": 0.34974822906887426, "frac_reward_zero_std": 1.0, "grad_norm": 0.8270206080442818, "kl": 2.25390625, "learning_rate": 1.6440291871877343e-05, "loss": 0.0901, "num_tokens": 952024468.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1855.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1494.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 694.0, "completions/min_terminated_length": 0.0, "epoch": 0.34991892122556967, "frac_reward_zero_std": 1.0, "grad_norm": 0.5250793448039617, "kl": 2.375, "learning_rate": 1.6435732941766308e-05, "loss": 0.0949, "num_tokens": 952446868.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1498.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 940.0, "completions/min_terminated_length": 0.0, "epoch": 0.3500896133822651, "frac_reward_zero_std": 1.0, "grad_norm": 0.24836063909702302, "kl": 2.38671875, "learning_rate": 1.6431171727202393e-05, "loss": 0.0954, "num_tokens": 952867684.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1978.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1395.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 856.0, "completions/min_terminated_length": 0.0, "epoch": 0.3502603055389605, "frac_reward_zero_std": 1.0, "grad_norm": 0.41946167465541845, "kl": 2.3125, "learning_rate": 1.6426608229804665e-05, "loss": 0.0925, "num_tokens": 953268676.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1459.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1015.0, "completions/min_terminated_length": 0.0, "epoch": 0.3504309976956559, "frac_reward_zero_std": 1.0, "grad_norm": 0.4046730870527007, "kl": 2.35546875, "learning_rate": 1.6422042451193005e-05, "loss": 0.0944, "num_tokens": 953684644.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1352.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1022.0, "completions/min_terminated_length": 0.0, "epoch": 0.3506016898523513, "frac_reward_zero_std": 1.0, "grad_norm": 0.27590515121428716, "kl": 2.48828125, "learning_rate": 1.6417474392988097e-05, "loss": 0.0997, "num_tokens": 954065812.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1734.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1467.375, "completions/mean_terminated_length": 0.0, "completions/min_length": 1213.0, "completions/min_terminated_length": 0.0, "epoch": 0.3507723820090467, "frac_reward_zero_std": 1.0, "grad_norm": 0.7658711571278859, "kl": 2.45703125, "learning_rate": 1.6412904056811436e-05, "loss": 0.0982, "num_tokens": 954481620.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1488.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 871.0, "completions/min_terminated_length": 0.0, "epoch": 0.3509430741657421, "frac_reward_zero_std": 1.0, "grad_norm": 0.28368288054640073, "kl": 2.359375, "learning_rate": 1.640833144428533e-05, "loss": 0.0944, "num_tokens": 954900228.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1506.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1277.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1067.0, "completions/min_terminated_length": 0.0, "epoch": 0.35111376632243746, "frac_reward_zero_std": 1.0, "grad_norm": 0.23209592470704693, "kl": 2.4375, "learning_rate": 1.6403756557032885e-05, "loss": 0.0974, "num_tokens": 955267348.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1578.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1183.0, "completions/min_terminated_length": 0.0, "epoch": 0.35128445847913287, "frac_reward_zero_std": 1.0, "grad_norm": 0.19935783900521528, "kl": 2.14453125, "learning_rate": 1.6399179396678028e-05, "loss": 0.0858, "num_tokens": 955710404.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1694.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1175.0, "completions/min_terminated_length": 0.0, "epoch": 0.3514551506358283, "frac_reward_zero_std": 1.0, "grad_norm": 0.15803669837231038, "kl": 1.95703125, "learning_rate": 1.639459996484548e-05, "loss": 0.0784, "num_tokens": 956183604.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1599.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 958.0, "completions/min_terminated_length": 0.0, "epoch": 0.3516258427925237, "frac_reward_zero_std": 1.0, "grad_norm": 0.14109417881359523, "kl": 1.85546875, "learning_rate": 1.639001826316078e-05, "loss": 0.0742, "num_tokens": 956644292.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1919.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1652.0, "completions/min_terminated_length": 0.0, "epoch": 0.3517965349492191, "frac_reward_zero_std": 1.0, "grad_norm": 0.10198437302244753, "kl": 1.70703125, "learning_rate": 1.6385434293250257e-05, "loss": 0.0683, "num_tokens": 957176788.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1960.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1699.0, "completions/min_terminated_length": 0.0, "epoch": 0.3519672271059145, "frac_reward_zero_std": 1.0, "grad_norm": 0.18234121314112178, "kl": 1.53515625, "learning_rate": 1.6380848056741062e-05, "loss": 0.0614, "num_tokens": 957729844.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3521379192626099, "frac_reward_zero_std": 1.0, "grad_norm": 0.7425393345653359, "kl": 1.189453125, "learning_rate": 1.6376259555261148e-05, "loss": 0.0476, "num_tokens": 958297300.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3523086114193053, "frac_reward_zero_std": 1.0, "grad_norm": 0.2786730998888626, "kl": 1.150390625, "learning_rate": 1.6371668790439258e-05, "loss": 0.0459, "num_tokens": 958859764.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35247930357600066, "frac_reward_zero_std": 1.0, "grad_norm": 0.18921378916774917, "kl": 0.9208984375, "learning_rate": 1.636707576390495e-05, "loss": 0.0368, "num_tokens": 959425572.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35264999573269606, "frac_reward_zero_std": 1.0, "grad_norm": 0.38588099935600156, "kl": 0.7802734375, "learning_rate": 1.6362480477288585e-05, "loss": 0.0312, "num_tokens": 959989908.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35282068788939147, "frac_reward_zero_std": 1.0, "grad_norm": 0.31630536327245756, "kl": 0.5615234375, "learning_rate": 1.6357882932221323e-05, "loss": 0.0225, "num_tokens": 960555396.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3529913800460869, "frac_reward_zero_std": 1.0, "grad_norm": 0.2322657881027432, "kl": 0.33984375, "learning_rate": 1.6353283130335126e-05, "loss": 0.0136, "num_tokens": 961115636.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3531620722027823, "frac_reward_zero_std": 1.0, "grad_norm": 0.13332742263554506, "kl": 0.215576171875, "learning_rate": 1.634868107326276e-05, "loss": 0.0086, "num_tokens": 961676996.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3533327643594777, "frac_reward_zero_std": 1.0, "grad_norm": 0.10022751488145094, "kl": 0.1199951171875, "learning_rate": 1.6344076762637785e-05, "loss": 0.0048, "num_tokens": 962248372.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3535034565161731, "frac_reward_zero_std": 1.0, "grad_norm": 0.03564903682825387, "kl": 0.076171875, "learning_rate": 1.633947020009457e-05, "loss": 0.003, "num_tokens": 962816852.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3536741486728685, "frac_reward_zero_std": 1.0, "grad_norm": 0.02023279460905421, "kl": 0.05743408203125, "learning_rate": 1.6334861387268276e-05, "loss": 0.0023, "num_tokens": 963384132.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35384484082956386, "frac_reward_zero_std": 1.0, "grad_norm": 0.03497304865757194, "kl": 0.05194091796875, "learning_rate": 1.633025032579486e-05, "loss": 0.0021, "num_tokens": 963951316.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35401553298625926, "frac_reward_zero_std": 1.0, "grad_norm": 0.05172612262698176, "kl": 0.0633544921875, "learning_rate": 1.632563701731109e-05, "loss": 0.0025, "num_tokens": 964513012.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35418622514295467, "frac_reward_zero_std": 1.0, "grad_norm": 0.03134696568605817, "kl": 0.0416259765625, "learning_rate": 1.632102146345452e-05, "loss": 0.0017, "num_tokens": 965072804.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3543569172996501, "frac_reward_zero_std": 1.0, "grad_norm": 0.04048496347622501, "kl": 0.03802490234375, "learning_rate": 1.6316403665863506e-05, "loss": 0.0015, "num_tokens": 965638324.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3545276094563455, "frac_reward_zero_std": 1.0, "grad_norm": 0.007418647016909164, "kl": 0.031219482421875, "learning_rate": 1.6311783626177203e-05, "loss": 0.0012, "num_tokens": 966200916.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3546983016130409, "frac_reward_zero_std": 1.0, "grad_norm": 0.0086214719721992, "kl": 0.029449462890625, "learning_rate": 1.6307161346035553e-05, "loss": 0.0012, "num_tokens": 966765172.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3548689937697363, "frac_reward_zero_std": 1.0, "grad_norm": 0.04730711928779384, "kl": 0.044677734375, "learning_rate": 1.63025368270793e-05, "loss": 0.0018, "num_tokens": 967328836.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3550396859264317, "frac_reward_zero_std": 1.0, "grad_norm": 0.028045293085926737, "kl": 0.03424072265625, "learning_rate": 1.6297910070949986e-05, "loss": 0.0014, "num_tokens": 967892020.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35521037808312705, "frac_reward_zero_std": 1.0, "grad_norm": 0.003341464501493143, "kl": 0.02703857421875, "learning_rate": 1.6293281079289934e-05, "loss": 0.0011, "num_tokens": 968456484.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35538107023982246, "frac_reward_zero_std": 1.0, "grad_norm": 0.012685104514705892, "kl": 0.02880859375, "learning_rate": 1.6288649853742274e-05, "loss": 0.0012, "num_tokens": 969028692.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35555176239651787, "frac_reward_zero_std": 1.0, "grad_norm": 0.006556471530005801, "kl": 0.02801513671875, "learning_rate": 1.6284016395950924e-05, "loss": 0.0011, "num_tokens": 969593140.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3557224545532133, "frac_reward_zero_std": 1.0, "grad_norm": 0.01643925517114842, "kl": 0.02825927734375, "learning_rate": 1.627938070756059e-05, "loss": 0.0011, "num_tokens": 970157188.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3558931467099087, "frac_reward_zero_std": 1.0, "grad_norm": 0.001534129629371016, "kl": 0.02423095703125, "learning_rate": 1.6274742790216783e-05, "loss": 0.001, "num_tokens": 970730724.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3560638388666041, "frac_reward_zero_std": 1.0, "grad_norm": 0.00792605569576655, "kl": 0.028839111328125, "learning_rate": 1.627010264556579e-05, "loss": 0.0012, "num_tokens": 971298100.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3562345310232995, "frac_reward_zero_std": 1.0, "grad_norm": 0.023532753993744545, "kl": 0.02838134765625, "learning_rate": 1.626546027525469e-05, "loss": 0.0011, "num_tokens": 971869732.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3564052231799949, "frac_reward_zero_std": 1.0, "grad_norm": 0.012821313161642929, "kl": 0.030517578125, "learning_rate": 1.6260815680931368e-05, "loss": 0.0012, "num_tokens": 972438020.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35657591533669025, "frac_reward_zero_std": 1.0, "grad_norm": 0.013652606242964231, "kl": 0.02880859375, "learning_rate": 1.6256168864244478e-05, "loss": 0.0012, "num_tokens": 973010612.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35674660749338566, "frac_reward_zero_std": 1.0, "grad_norm": 0.016435050846209772, "kl": 0.0301513671875, "learning_rate": 1.6251519826843477e-05, "loss": 0.0012, "num_tokens": 973575972.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35691729965008107, "frac_reward_zero_std": 1.0, "grad_norm": 0.02847407515975374, "kl": 0.031280517578125, "learning_rate": 1.6246868570378608e-05, "loss": 0.0013, "num_tokens": 974143156.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3570879918067765, "frac_reward_zero_std": 1.0, "grad_norm": 0.050232466724922, "kl": 0.029083251953125, "learning_rate": 1.624221509650089e-05, "loss": 0.0012, "num_tokens": 974708132.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3572586839634719, "frac_reward_zero_std": 1.0, "grad_norm": 0.12169336956434464, "kl": 0.046722412109375, "learning_rate": 1.6237559406862145e-05, "loss": 0.0019, "num_tokens": 975277924.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3574293761201673, "frac_reward_zero_std": 1.0, "grad_norm": 0.013831322683152354, "kl": 0.02960205078125, "learning_rate": 1.6232901503114976e-05, "loss": 0.0012, "num_tokens": 975843956.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3576000682768627, "frac_reward_zero_std": 1.0, "grad_norm": 0.032736978316166614, "kl": 0.033172607421875, "learning_rate": 1.622824138691277e-05, "loss": 0.0013, "num_tokens": 976412372.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3577707604335581, "frac_reward_zero_std": 1.0, "grad_norm": 0.04072269930908554, "kl": 0.0469970703125, "learning_rate": 1.6223579059909693e-05, "loss": 0.0019, "num_tokens": 976982548.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35794145259025345, "frac_reward_zero_std": 1.0, "grad_norm": 0.014530745371357406, "kl": 0.02935791015625, "learning_rate": 1.6218914523760712e-05, "loss": 0.0012, "num_tokens": 977549268.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35811214474694886, "frac_reward_zero_std": 1.0, "grad_norm": 0.04582467574047161, "kl": 0.037017822265625, "learning_rate": 1.6214247780121563e-05, "loss": 0.0015, "num_tokens": 978120228.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35828283690364426, "frac_reward_zero_std": 1.0, "grad_norm": 0.046264210602457397, "kl": 0.0435791015625, "learning_rate": 1.620957883064877e-05, "loss": 0.0017, "num_tokens": 978686756.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35845352906033967, "frac_reward_zero_std": 1.0, "grad_norm": 0.03241188300238624, "kl": 0.0380859375, "learning_rate": 1.6204907676999652e-05, "loss": 0.0015, "num_tokens": 979256212.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3586242212170351, "frac_reward_zero_std": 1.0, "grad_norm": 0.021812167670272293, "kl": 0.04034423828125, "learning_rate": 1.620023432083229e-05, "loss": 0.0016, "num_tokens": 979820404.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3587949133737305, "frac_reward_zero_std": 1.0, "grad_norm": 0.017265108571219916, "kl": 0.04388427734375, "learning_rate": 1.619555876380556e-05, "loss": 0.0018, "num_tokens": 980384948.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3589656055304259, "frac_reward_zero_std": 1.0, "grad_norm": 0.024941159657407876, "kl": 0.0567626953125, "learning_rate": 1.6190881007579114e-05, "loss": 0.0023, "num_tokens": 980946324.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3591362976871213, "frac_reward_zero_std": 1.0, "grad_norm": 0.03262973140259064, "kl": 0.08001708984375, "learning_rate": 1.618620105381339e-05, "loss": 0.0032, "num_tokens": 981512372.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3593069898438167, "frac_reward_zero_std": 1.0, "grad_norm": 0.04950166234994395, "kl": 0.086669921875, "learning_rate": 1.61815189041696e-05, "loss": 0.0035, "num_tokens": 982079844.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35947768200051206, "frac_reward_zero_std": 1.0, "grad_norm": 0.0457114927320607, "kl": 0.1346435546875, "learning_rate": 1.617683456030974e-05, "loss": 0.0054, "num_tokens": 982647220.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35964837415720746, "frac_reward_zero_std": 1.0, "grad_norm": 0.08174639155269912, "kl": 0.1436767578125, "learning_rate": 1.617214802389658e-05, "loss": 0.0058, "num_tokens": 983226964.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35981906631390287, "frac_reward_zero_std": 1.0, "grad_norm": 0.09183331847205861, "kl": 0.16015625, "learning_rate": 1.6167459296593674e-05, "loss": 0.0064, "num_tokens": 983790148.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3599897584705983, "frac_reward_zero_std": 1.0, "grad_norm": 0.16740029761712724, "kl": 0.261474609375, "learning_rate": 1.616276838006535e-05, "loss": 0.0105, "num_tokens": 984352420.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3601604506272937, "frac_reward_zero_std": 1.0, "grad_norm": 0.1563632668387216, "kl": 0.2666015625, "learning_rate": 1.6158075275976716e-05, "loss": 0.0107, "num_tokens": 984918820.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3603311427839891, "frac_reward_zero_std": 1.0, "grad_norm": 0.06384976078994643, "kl": 0.32958984375, "learning_rate": 1.6153379985993655e-05, "loss": 0.0132, "num_tokens": 985479508.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3605018349406845, "frac_reward_zero_std": 1.0, "grad_norm": 0.10677800671690077, "kl": 0.37255859375, "learning_rate": 1.6148682511782817e-05, "loss": 0.0149, "num_tokens": 986048292.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3606725270973799, "frac_reward_zero_std": 1.0, "grad_norm": 0.07482153497328843, "kl": 0.36572265625, "learning_rate": 1.6143982855011646e-05, "loss": 0.0146, "num_tokens": 986608116.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36084321925407525, "frac_reward_zero_std": 1.0, "grad_norm": 0.08218583037983562, "kl": 0.33544921875, "learning_rate": 1.6139281017348347e-05, "loss": 0.0134, "num_tokens": 987173044.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36101391141077066, "frac_reward_zero_std": 1.0, "grad_norm": 0.20152234188201962, "kl": 0.31982421875, "learning_rate": 1.61345770004619e-05, "loss": 0.0128, "num_tokens": 987743588.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36118460356746607, "frac_reward_zero_std": 1.0, "grad_norm": 0.12228859980893578, "kl": 0.276123046875, "learning_rate": 1.6129870806022066e-05, "loss": 0.0111, "num_tokens": 988307508.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3613552957241615, "frac_reward_zero_std": 1.0, "grad_norm": 0.031044290270059487, "kl": 0.228759765625, "learning_rate": 1.6125162435699368e-05, "loss": 0.0091, "num_tokens": 988868404.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3615259878808569, "frac_reward_zero_std": 1.0, "grad_norm": 0.11216180332225688, "kl": 0.2080078125, "learning_rate": 1.612045189116511e-05, "loss": 0.0083, "num_tokens": 989428004.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3616966800375523, "frac_reward_zero_std": 1.0, "grad_norm": 0.1510663646349331, "kl": 0.218994140625, "learning_rate": 1.6115739174091372e-05, "loss": 0.0088, "num_tokens": 989991924.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3618673721942477, "frac_reward_zero_std": 1.0, "grad_norm": 0.08131676377299851, "kl": 0.197509765625, "learning_rate": 1.611102428615099e-05, "loss": 0.0079, "num_tokens": 990557460.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3620380643509431, "frac_reward_zero_std": 1.0, "grad_norm": 0.10705470644770507, "kl": 0.252197265625, "learning_rate": 1.610630722901758e-05, "loss": 0.0101, "num_tokens": 991129636.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36220875650763845, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830723238783115, "kl": 0.21337890625, "learning_rate": 1.6101588004365532e-05, "loss": 0.0085, "num_tokens": 991702532.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36237944866433386, "frac_reward_zero_std": 1.0, "grad_norm": 0.08578457742085872, "kl": 0.288330078125, "learning_rate": 1.6096866613869993e-05, "loss": 0.0115, "num_tokens": 992264084.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36255014082102927, "frac_reward_zero_std": 1.0, "grad_norm": 0.4120984506821643, "kl": 0.521484375, "learning_rate": 1.6092143059206894e-05, "loss": 0.0208, "num_tokens": 992830532.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36272083297772467, "frac_reward_zero_std": 1.0, "grad_norm": 0.16550581857566254, "kl": 0.7041015625, "learning_rate": 1.6087417342052922e-05, "loss": 0.0281, "num_tokens": 993392276.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3628915251344201, "frac_reward_zero_std": 1.0, "grad_norm": 0.5990323065332638, "kl": 0.7265625, "learning_rate": 1.6082689464085538e-05, "loss": 0.0291, "num_tokens": 993953076.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3630622172911155, "frac_reward_zero_std": 1.0, "grad_norm": 0.2712380490479352, "kl": 0.578125, "learning_rate": 1.607795942698296e-05, "loss": 0.0231, "num_tokens": 994521604.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3632329094478109, "frac_reward_zero_std": 1.0, "grad_norm": 0.1018143224276811, "kl": 0.4716796875, "learning_rate": 1.6073227232424192e-05, "loss": 0.0189, "num_tokens": 995086932.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3634036016045063, "frac_reward_zero_std": 1.0, "grad_norm": 0.14820548025856714, "kl": 0.4169921875, "learning_rate": 1.606849288208899e-05, "loss": 0.0167, "num_tokens": 995654244.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36357429376120165, "frac_reward_zero_std": 1.0, "grad_norm": 0.15367980433821998, "kl": 0.280029296875, "learning_rate": 1.6063756377657873e-05, "loss": 0.0112, "num_tokens": 996215268.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36374498591789706, "frac_reward_zero_std": 1.0, "grad_norm": 0.12101959291257718, "kl": 0.226318359375, "learning_rate": 1.6059017720812126e-05, "loss": 0.0091, "num_tokens": 996778452.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36391567807459246, "frac_reward_zero_std": 1.0, "grad_norm": 0.05102347241302212, "kl": 0.1011962890625, "learning_rate": 1.6054276913233814e-05, "loss": 0.004, "num_tokens": 997341156.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36408637023128787, "frac_reward_zero_std": 1.0, "grad_norm": 0.01444133938258459, "kl": 0.0670166015625, "learning_rate": 1.604953395660574e-05, "loss": 0.0027, "num_tokens": 997904164.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3642570623879833, "frac_reward_zero_std": 1.0, "grad_norm": 0.019490020343081848, "kl": 0.05145263671875, "learning_rate": 1.604478885261149e-05, "loss": 0.0021, "num_tokens": 998475572.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3644277545446787, "frac_reward_zero_std": 1.0, "grad_norm": 0.034135060322420765, "kl": 0.06427001953125, "learning_rate": 1.6040041602935397e-05, "loss": 0.0026, "num_tokens": 999036420.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3645984467013741, "frac_reward_zero_std": 1.0, "grad_norm": 0.06201868259725143, "kl": 0.06341552734375, "learning_rate": 1.603529220926257e-05, "loss": 0.0025, "num_tokens": 999606676.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3647691388580695, "frac_reward_zero_std": 1.0, "grad_norm": 0.02502041818369304, "kl": 0.05126953125, "learning_rate": 1.603054067327887e-05, "loss": 0.0021, "num_tokens": 1000180436.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36493983101476485, "frac_reward_zero_std": 1.0, "grad_norm": 0.026128605409476525, "kl": 0.06591796875, "learning_rate": 1.6025786996670923e-05, "loss": 0.0026, "num_tokens": 1000742068.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36511052317146026, "frac_reward_zero_std": 1.0, "grad_norm": 0.1062368554837246, "kl": 0.0650634765625, "learning_rate": 1.602103118112611e-05, "loss": 0.0026, "num_tokens": 1001307332.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36528121532815566, "frac_reward_zero_std": 1.0, "grad_norm": 0.014117150819844066, "kl": 0.04766845703125, "learning_rate": 1.601627322833257e-05, "loss": 0.0019, "num_tokens": 1001874756.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36545190748485107, "frac_reward_zero_std": 1.0, "grad_norm": 0.010311069233587569, "kl": 0.0435791015625, "learning_rate": 1.6011513139979214e-05, "loss": 0.0017, "num_tokens": 1002446516.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3656225996415465, "frac_reward_zero_std": 1.0, "grad_norm": 0.07321456610034124, "kl": 0.11492919921875, "learning_rate": 1.60067509177557e-05, "loss": 0.0046, "num_tokens": 1003013476.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3657932917982419, "frac_reward_zero_std": 1.0, "grad_norm": 0.008906327573562649, "kl": 0.0479736328125, "learning_rate": 1.6001986563352433e-05, "loss": 0.0019, "num_tokens": 1003581092.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3659639839549373, "frac_reward_zero_std": 1.0, "grad_norm": 0.01637804978689717, "kl": 0.060302734375, "learning_rate": 1.59972200784606e-05, "loss": 0.0024, "num_tokens": 1004145412.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3661346761116327, "frac_reward_zero_std": 1.0, "grad_norm": 0.029897307625979565, "kl": 0.06280517578125, "learning_rate": 1.5992451464772124e-05, "loss": 0.0025, "num_tokens": 1004721844.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36630536826832805, "frac_reward_zero_std": 1.0, "grad_norm": 0.06460540950832162, "kl": 0.08343505859375, "learning_rate": 1.59876807239797e-05, "loss": 0.0033, "num_tokens": 1005283940.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36647606042502345, "frac_reward_zero_std": 1.0, "grad_norm": 0.05872064864622787, "kl": 0.08856201171875, "learning_rate": 1.5982907857776752e-05, "loss": 0.0035, "num_tokens": 1005845396.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36664675258171886, "frac_reward_zero_std": 1.0, "grad_norm": 0.03657488632121797, "kl": 0.0802001953125, "learning_rate": 1.597813286785749e-05, "loss": 0.0032, "num_tokens": 1006409316.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36681744473841427, "frac_reward_zero_std": 1.0, "grad_norm": 0.05682734799581929, "kl": 0.0953369140625, "learning_rate": 1.5973355755916856e-05, "loss": 0.0038, "num_tokens": 1006973220.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3669881368951097, "frac_reward_zero_std": 1.0, "grad_norm": 0.026113952977689883, "kl": 0.109130859375, "learning_rate": 1.5968576523650556e-05, "loss": 0.0044, "num_tokens": 1007540468.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3671588290518051, "frac_reward_zero_std": 1.0, "grad_norm": 0.0553172156065384, "kl": 0.11474609375, "learning_rate": 1.5963795172755044e-05, "loss": 0.0046, "num_tokens": 1008111332.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3673295212085005, "frac_reward_zero_std": 1.0, "grad_norm": 0.05762569224552678, "kl": 0.1243896484375, "learning_rate": 1.5959011704927525e-05, "loss": 0.005, "num_tokens": 1008674740.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3675002133651959, "frac_reward_zero_std": 1.0, "grad_norm": 0.018913922640176523, "kl": 0.1231689453125, "learning_rate": 1.595422612186596e-05, "loss": 0.0049, "num_tokens": 1009232644.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36767090552189124, "frac_reward_zero_std": 1.0, "grad_norm": 0.05957429832999007, "kl": 0.207763671875, "learning_rate": 1.5949438425269053e-05, "loss": 0.0083, "num_tokens": 1009796452.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36784159767858665, "frac_reward_zero_std": 1.0, "grad_norm": 0.021446771793478692, "kl": 0.1715087890625, "learning_rate": 1.594464861683627e-05, "loss": 0.0069, "num_tokens": 1010358468.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36801228983528206, "frac_reward_zero_std": 1.0, "grad_norm": 0.05208504384978378, "kl": 0.25927734375, "learning_rate": 1.593985669826782e-05, "loss": 0.0104, "num_tokens": 1010917988.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36818298199197746, "frac_reward_zero_std": 1.0, "grad_norm": 0.20378913690510114, "kl": 0.261962890625, "learning_rate": 1.5935062671264655e-05, "loss": 0.0105, "num_tokens": 1011493860.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36835367414867287, "frac_reward_zero_std": 1.0, "grad_norm": 0.0991427525328935, "kl": 0.2529296875, "learning_rate": 1.593026653752849e-05, "loss": 0.0101, "num_tokens": 1012056788.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3685243663053683, "frac_reward_zero_std": 1.0, "grad_norm": 0.23725442672263708, "kl": 0.251953125, "learning_rate": 1.592546829876177e-05, "loss": 0.0101, "num_tokens": 1012622228.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3686950584620637, "frac_reward_zero_std": 1.0, "grad_norm": 0.17454325513367208, "kl": 0.404296875, "learning_rate": 1.5920667956667703e-05, "loss": 0.0162, "num_tokens": 1013192580.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3688657506187591, "frac_reward_zero_std": 1.0, "grad_norm": 0.5693725151017952, "kl": 0.5185546875, "learning_rate": 1.5915865512950236e-05, "loss": 0.0208, "num_tokens": 1013762692.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36903644277545444, "frac_reward_zero_std": 0.9375, "grad_norm": 0.9310250632950315, "kl": 0.6630859375, "learning_rate": 1.591106096931406e-05, "loss": 0.0265, "num_tokens": 1014324276.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36920713493214985, "frac_reward_zero_std": 1.0, "grad_norm": 0.13144824377292844, "kl": 0.76953125, "learning_rate": 1.5906254327464616e-05, "loss": 0.0307, "num_tokens": 1014885252.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36937782708884526, "frac_reward_zero_std": 1.0, "grad_norm": 1.3055078999289442, "kl": 0.8779296875, "learning_rate": 1.5901445589108094e-05, "loss": 0.0351, "num_tokens": 1015449252.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36954851924554066, "frac_reward_zero_std": 1.0, "grad_norm": 1.8992227886196043, "kl": 0.8427734375, "learning_rate": 1.5896634755951416e-05, "loss": 0.0337, "num_tokens": 1016011556.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36971921140223607, "frac_reward_zero_std": 1.0, "grad_norm": 0.9153701096394643, "kl": 0.771484375, "learning_rate": 1.5891821829702253e-05, "loss": 0.0308, "num_tokens": 1016582100.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3698899035589315, "frac_reward_zero_std": 1.0, "grad_norm": 0.6200212324074313, "kl": 0.6962890625, "learning_rate": 1.5887006812069025e-05, "loss": 0.0279, "num_tokens": 1017141268.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3700605957156269, "frac_reward_zero_std": 1.0, "grad_norm": 0.16180372628198283, "kl": 0.552734375, "learning_rate": 1.5882189704760887e-05, "loss": 0.0221, "num_tokens": 1017707748.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3702312878723223, "frac_reward_zero_std": 1.0, "grad_norm": 0.5537599726229181, "kl": 0.42041015625, "learning_rate": 1.587737050948774e-05, "loss": 0.0168, "num_tokens": 1018275396.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37040198002901764, "frac_reward_zero_std": 1.0, "grad_norm": 0.8322871961569519, "kl": 0.46337890625, "learning_rate": 1.587254922796022e-05, "loss": 0.0185, "num_tokens": 1018840004.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37057267218571305, "frac_reward_zero_std": 1.0, "grad_norm": 0.3290395083377026, "kl": 0.380859375, "learning_rate": 1.5867725861889714e-05, "loss": 0.0152, "num_tokens": 1019405028.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37074336434240845, "frac_reward_zero_std": 1.0, "grad_norm": 0.14336258030036264, "kl": 0.37548828125, "learning_rate": 1.586290041298834e-05, "loss": 0.015, "num_tokens": 1019972372.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37091405649910386, "frac_reward_zero_std": 1.0, "grad_norm": 0.31455619271852964, "kl": 0.39404296875, "learning_rate": 1.5858072882968957e-05, "loss": 0.0158, "num_tokens": 1020540820.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37108474865579927, "frac_reward_zero_std": 1.0, "grad_norm": 0.32942603955463773, "kl": 0.3466796875, "learning_rate": 1.5853243273545164e-05, "loss": 0.0139, "num_tokens": 1021106724.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3712554408124947, "frac_reward_zero_std": 1.0, "grad_norm": 0.3759342689115062, "kl": 0.38037109375, "learning_rate": 1.58484115864313e-05, "loss": 0.0152, "num_tokens": 1021667956.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3714261329691901, "frac_reward_zero_std": 1.0, "grad_norm": 0.28192697835757613, "kl": 0.3203125, "learning_rate": 1.584357782334244e-05, "loss": 0.0128, "num_tokens": 1022236100.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3715968251258855, "frac_reward_zero_std": 1.0, "grad_norm": 0.15896623346933675, "kl": 0.253173828125, "learning_rate": 1.583874198599439e-05, "loss": 0.0101, "num_tokens": 1022809348.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37176751728258084, "frac_reward_zero_std": 1.0, "grad_norm": 0.03131460522632245, "kl": 0.2978515625, "learning_rate": 1.5833904076103703e-05, "loss": 0.0119, "num_tokens": 1023370724.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37193820943927625, "frac_reward_zero_std": 1.0, "grad_norm": 0.29329371660471815, "kl": 0.30029296875, "learning_rate": 1.5829064095387662e-05, "loss": 0.012, "num_tokens": 1023936004.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37210890159597165, "frac_reward_zero_std": 1.0, "grad_norm": 0.5185990243504406, "kl": 0.3447265625, "learning_rate": 1.582422204556428e-05, "loss": 0.0138, "num_tokens": 1024499236.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37227959375266706, "frac_reward_zero_std": 1.0, "grad_norm": 0.17195068548669468, "kl": 0.2900390625, "learning_rate": 1.581937792835232e-05, "loss": 0.0116, "num_tokens": 1025068212.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37245028590936247, "frac_reward_zero_std": 1.0, "grad_norm": 0.07350853686510628, "kl": 0.3466796875, "learning_rate": 1.581453174547126e-05, "loss": 0.0139, "num_tokens": 1025634852.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3726209780660579, "frac_reward_zero_std": 1.0, "grad_norm": 0.21930684615007442, "kl": 0.4482421875, "learning_rate": 1.5809683498641322e-05, "loss": 0.0179, "num_tokens": 1026214356.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3727916702227533, "frac_reward_zero_std": 1.0, "grad_norm": 0.27362996407520407, "kl": 0.39697265625, "learning_rate": 1.580483318958346e-05, "loss": 0.0159, "num_tokens": 1026783060.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3729623623794487, "frac_reward_zero_std": 1.0, "grad_norm": 0.20635516457322117, "kl": 0.416015625, "learning_rate": 1.579998082001936e-05, "loss": 0.0167, "num_tokens": 1027348404.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37313305453614404, "frac_reward_zero_std": 1.0, "grad_norm": 0.3123174145941514, "kl": 0.42236328125, "learning_rate": 1.5795126391671435e-05, "loss": 0.0169, "num_tokens": 1027910692.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37330374669283944, "frac_reward_zero_std": 1.0, "grad_norm": 0.2843400277007046, "kl": 0.36181640625, "learning_rate": 1.5790269906262833e-05, "loss": 0.0145, "num_tokens": 1028476340.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37347443884953485, "frac_reward_zero_std": 1.0, "grad_norm": 0.1948158140364261, "kl": 0.32421875, "learning_rate": 1.578541136551743e-05, "loss": 0.013, "num_tokens": 1029042388.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37364513100623026, "frac_reward_zero_std": 1.0, "grad_norm": 0.2475444587134703, "kl": 0.32861328125, "learning_rate": 1.5780550771159835e-05, "loss": 0.0132, "num_tokens": 1029606948.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37381582316292566, "frac_reward_zero_std": 1.0, "grad_norm": 0.11610847899701611, "kl": 0.31298828125, "learning_rate": 1.5775688124915384e-05, "loss": 0.0125, "num_tokens": 1030174516.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37398651531962107, "frac_reward_zero_std": 1.0, "grad_norm": 0.10492715267418586, "kl": 0.32373046875, "learning_rate": 1.5770823428510134e-05, "loss": 0.013, "num_tokens": 1030740084.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3741572074763165, "frac_reward_zero_std": 1.0, "grad_norm": 0.13538111302423603, "kl": 0.2705078125, "learning_rate": 1.5765956683670887e-05, "loss": 0.0108, "num_tokens": 1031306516.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3743278996330119, "frac_reward_zero_std": 1.0, "grad_norm": 0.1840164309385316, "kl": 0.32470703125, "learning_rate": 1.576108789212515e-05, "loss": 0.013, "num_tokens": 1031863572.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37449859178970724, "frac_reward_zero_std": 1.0, "grad_norm": 0.15479129024339813, "kl": 0.28125, "learning_rate": 1.575621705560118e-05, "loss": 0.0113, "num_tokens": 1032428196.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37466928394640264, "frac_reward_zero_std": 1.0, "grad_norm": 0.047449566055400544, "kl": 0.27099609375, "learning_rate": 1.575134417582794e-05, "loss": 0.0108, "num_tokens": 1032992564.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37483997610309805, "frac_reward_zero_std": 1.0, "grad_norm": 0.1734765585407499, "kl": 0.265869140625, "learning_rate": 1.574646925453513e-05, "loss": 0.0106, "num_tokens": 1033559252.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37501066825979346, "frac_reward_zero_std": 1.0, "grad_norm": 0.06774165566827549, "kl": 0.261474609375, "learning_rate": 1.5741592293453175e-05, "loss": 0.0104, "num_tokens": 1034121988.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37518136041648886, "frac_reward_zero_std": 1.0, "grad_norm": 0.10826359596366453, "kl": 0.26953125, "learning_rate": 1.573671329431321e-05, "loss": 0.0108, "num_tokens": 1034695316.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37535205257318427, "frac_reward_zero_std": 1.0, "grad_norm": 0.14532718941930073, "kl": 0.329833984375, "learning_rate": 1.5731832258847107e-05, "loss": 0.0132, "num_tokens": 1035260836.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3755227447298797, "frac_reward_zero_std": 1.0, "grad_norm": 0.22311111420318167, "kl": 0.33642578125, "learning_rate": 1.5726949188787463e-05, "loss": 0.0135, "num_tokens": 1035827124.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3756934368865751, "frac_reward_zero_std": 1.0, "grad_norm": 0.12285864428619792, "kl": 0.3837890625, "learning_rate": 1.5722064085867587e-05, "loss": 0.0153, "num_tokens": 1036393028.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37586412904327043, "frac_reward_zero_std": 1.0, "grad_norm": 0.07221558511583852, "kl": 0.35302734375, "learning_rate": 1.5717176951821514e-05, "loss": 0.0141, "num_tokens": 1036955892.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37603482119996584, "frac_reward_zero_std": 1.0, "grad_norm": 0.21689688594151219, "kl": 0.36083984375, "learning_rate": 1.5712287788384006e-05, "loss": 0.0144, "num_tokens": 1037523460.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37620551335666125, "frac_reward_zero_std": 1.0, "grad_norm": 0.20204712379141984, "kl": 0.34912109375, "learning_rate": 1.570739659729053e-05, "loss": 0.014, "num_tokens": 1038090180.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37637620551335665, "frac_reward_zero_std": 1.0, "grad_norm": 0.20137508356190228, "kl": 0.39208984375, "learning_rate": 1.570250338027729e-05, "loss": 0.0157, "num_tokens": 1038650836.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37654689767005206, "frac_reward_zero_std": 1.0, "grad_norm": 0.1525746197661792, "kl": 0.37744140625, "learning_rate": 1.5697608139081196e-05, "loss": 0.0151, "num_tokens": 1039217140.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37671758982674747, "frac_reward_zero_std": 1.0, "grad_norm": 0.1493549110580505, "kl": 0.35498046875, "learning_rate": 1.5692710875439888e-05, "loss": 0.0142, "num_tokens": 1039787716.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3768882819834429, "frac_reward_zero_std": 1.0, "grad_norm": 0.40514466637393565, "kl": 0.40380859375, "learning_rate": 1.5687811591091713e-05, "loss": 0.0161, "num_tokens": 1040350916.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3770589741401383, "frac_reward_zero_std": 1.0, "grad_norm": 0.19884230217237547, "kl": 0.49365234375, "learning_rate": 1.568291028777574e-05, "loss": 0.0198, "num_tokens": 1040909796.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3772296662968337, "frac_reward_zero_std": 1.0, "grad_norm": 0.1958552357165228, "kl": 0.54833984375, "learning_rate": 1.5678006967231755e-05, "loss": 0.0219, "num_tokens": 1041476628.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37740035845352904, "frac_reward_zero_std": 1.0, "grad_norm": 0.25846436550648233, "kl": 0.611328125, "learning_rate": 1.5673101631200262e-05, "loss": 0.0245, "num_tokens": 1042040964.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37757105061022445, "frac_reward_zero_std": 1.0, "grad_norm": 0.6837615107442633, "kl": 0.689453125, "learning_rate": 1.566819428142248e-05, "loss": 0.0276, "num_tokens": 1042601636.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37774174276691985, "frac_reward_zero_std": 1.0, "grad_norm": 0.19430597158342938, "kl": 0.498046875, "learning_rate": 1.5663284919640336e-05, "loss": 0.0199, "num_tokens": 1043176132.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37791243492361526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0454565864059217, "kl": 0.44287109375, "learning_rate": 1.565837354759648e-05, "loss": 0.0177, "num_tokens": 1043746116.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37808312708031067, "frac_reward_zero_std": 1.0, "grad_norm": 0.5433151148678977, "kl": 0.44921875, "learning_rate": 1.5653460167034266e-05, "loss": 0.018, "num_tokens": 1044307108.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3782538192370061, "frac_reward_zero_std": 1.0, "grad_norm": 0.2074378809366492, "kl": 0.33447265625, "learning_rate": 1.5648544779697773e-05, "loss": 0.0134, "num_tokens": 1044875476.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3784245113937015, "frac_reward_zero_std": 1.0, "grad_norm": 0.23047941142862652, "kl": 0.36376953125, "learning_rate": 1.5643627387331783e-05, "loss": 0.0146, "num_tokens": 1045440916.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3785952035503969, "frac_reward_zero_std": 1.0, "grad_norm": 0.08093781699612874, "kl": 0.337890625, "learning_rate": 1.5638707991681796e-05, "loss": 0.0135, "num_tokens": 1046005588.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37876589570709224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0701987924279865, "kl": 0.318359375, "learning_rate": 1.5633786594494015e-05, "loss": 0.0127, "num_tokens": 1046571668.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37893658786378764, "frac_reward_zero_std": 1.0, "grad_norm": 0.23926119637473287, "kl": 0.349609375, "learning_rate": 1.5628863197515365e-05, "loss": 0.014, "num_tokens": 1047139908.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37910728002048305, "frac_reward_zero_std": 1.0, "grad_norm": 0.19315065696671008, "kl": 0.3876953125, "learning_rate": 1.5623937802493467e-05, "loss": 0.0155, "num_tokens": 1047704708.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37927797217717846, "frac_reward_zero_std": 1.0, "grad_norm": 0.286574889457725, "kl": 0.46923828125, "learning_rate": 1.5619010411176662e-05, "loss": 0.0188, "num_tokens": 1048271908.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37944866433387386, "frac_reward_zero_std": 1.0, "grad_norm": 0.18817342124517086, "kl": 0.4873046875, "learning_rate": 1.5614081025313998e-05, "loss": 0.0195, "num_tokens": 1048836692.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37961935649056927, "frac_reward_zero_std": 1.0, "grad_norm": 0.04688493798314258, "kl": 0.50927734375, "learning_rate": 1.5609149646655225e-05, "loss": 0.0204, "num_tokens": 1049402388.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3797900486472647, "frac_reward_zero_std": 1.0, "grad_norm": 0.2507631123940852, "kl": 0.48828125, "learning_rate": 1.5604216276950804e-05, "loss": 0.0196, "num_tokens": 1049967588.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3799607408039601, "frac_reward_zero_std": 1.0, "grad_norm": 0.12334272597811864, "kl": 0.4736328125, "learning_rate": 1.5599280917951906e-05, "loss": 0.019, "num_tokens": 1050533588.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38013143296065544, "frac_reward_zero_std": 1.0, "grad_norm": 0.4526977208126307, "kl": 0.4873046875, "learning_rate": 1.5594343571410407e-05, "loss": 0.0195, "num_tokens": 1051095876.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38030212511735084, "frac_reward_zero_std": 1.0, "grad_norm": 0.12763413327869239, "kl": 0.458984375, "learning_rate": 1.5589404239078882e-05, "loss": 0.0184, "num_tokens": 1051655044.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38047281727404625, "frac_reward_zero_std": 1.0, "grad_norm": 0.06311843191972627, "kl": 0.3896484375, "learning_rate": 1.558446292271062e-05, "loss": 0.0156, "num_tokens": 1052221108.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38064350943074166, "frac_reward_zero_std": 1.0, "grad_norm": 0.12791045912252386, "kl": 0.35498046875, "learning_rate": 1.5579519624059605e-05, "loss": 0.0142, "num_tokens": 1052785876.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38081420158743706, "frac_reward_zero_std": 1.0, "grad_norm": 0.16713449185796506, "kl": 0.34375, "learning_rate": 1.5574574344880534e-05, "loss": 0.0137, "num_tokens": 1053352324.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38098489374413247, "frac_reward_zero_std": 1.0, "grad_norm": 0.20114022173150414, "kl": 0.3818359375, "learning_rate": 1.5569627086928804e-05, "loss": 0.0153, "num_tokens": 1053914484.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3811555859008279, "frac_reward_zero_std": 1.0, "grad_norm": 0.11762046715240593, "kl": 0.29150390625, "learning_rate": 1.5564677851960504e-05, "loss": 0.0117, "num_tokens": 1054485236.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3813262780575233, "frac_reward_zero_std": 1.0, "grad_norm": 0.07331797031804865, "kl": 0.294921875, "learning_rate": 1.5559726641732446e-05, "loss": 0.0118, "num_tokens": 1055057140.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38149697021421863, "frac_reward_zero_std": 1.0, "grad_norm": 0.186307762721446, "kl": 0.43017578125, "learning_rate": 1.5554773458002124e-05, "loss": 0.0172, "num_tokens": 1055620180.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38166766237091404, "frac_reward_zero_std": 1.0, "grad_norm": 0.04978377616485737, "kl": 0.482421875, "learning_rate": 1.5549818302527738e-05, "loss": 0.0193, "num_tokens": 1056184516.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38183835452760945, "frac_reward_zero_std": 1.0, "grad_norm": 0.07726049856261481, "kl": 0.6103515625, "learning_rate": 1.5544861177068193e-05, "loss": 0.0244, "num_tokens": 1056750740.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38200904668430485, "frac_reward_zero_std": 1.0, "grad_norm": 0.07253238942610238, "kl": 0.681640625, "learning_rate": 1.5539902083383088e-05, "loss": 0.0273, "num_tokens": 1057313076.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38217973884100026, "frac_reward_zero_std": 1.0, "grad_norm": 0.07980633959474871, "kl": 0.52734375, "learning_rate": 1.5534941023232723e-05, "loss": 0.0211, "num_tokens": 1057878532.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38235043099769567, "frac_reward_zero_std": 1.0, "grad_norm": 0.11747082785520611, "kl": 0.4736328125, "learning_rate": 1.5529977998378093e-05, "loss": 0.0189, "num_tokens": 1058443764.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3825211231543911, "frac_reward_zero_std": 1.0, "grad_norm": 0.058600044135385346, "kl": 0.396484375, "learning_rate": 1.5525013010580895e-05, "loss": 0.0159, "num_tokens": 1059010996.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3826918153110865, "frac_reward_zero_std": 1.0, "grad_norm": 0.08732517906878942, "kl": 0.365234375, "learning_rate": 1.552004606160352e-05, "loss": 0.0146, "num_tokens": 1059579748.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38286250746778183, "frac_reward_zero_std": 1.0, "grad_norm": 0.11867731040961502, "kl": 0.34619140625, "learning_rate": 1.5515077153209053e-05, "loss": 0.0139, "num_tokens": 1060140612.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38303319962447724, "frac_reward_zero_std": 1.0, "grad_norm": 0.11540540001102309, "kl": 0.349609375, "learning_rate": 1.551010628716128e-05, "loss": 0.014, "num_tokens": 1060705172.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38320389178117265, "frac_reward_zero_std": 1.0, "grad_norm": 0.08062017609402598, "kl": 0.37255859375, "learning_rate": 1.5505133465224683e-05, "loss": 0.0149, "num_tokens": 1061270996.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38337458393786805, "frac_reward_zero_std": 1.0, "grad_norm": 0.08934950643908479, "kl": 0.46923828125, "learning_rate": 1.5500158689164427e-05, "loss": 0.0188, "num_tokens": 1061838292.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38354527609456346, "frac_reward_zero_std": 1.0, "grad_norm": 0.062400099280210314, "kl": 0.44677734375, "learning_rate": 1.5495181960746377e-05, "loss": 0.0178, "num_tokens": 1062399956.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38371596825125887, "frac_reward_zero_std": 1.0, "grad_norm": 0.08169384622736102, "kl": 0.34326171875, "learning_rate": 1.54902032817371e-05, "loss": 0.0137, "num_tokens": 1062962676.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38388666040795427, "frac_reward_zero_std": 1.0, "grad_norm": 0.04154499944946119, "kl": 0.2392578125, "learning_rate": 1.548522265390384e-05, "loss": 0.0096, "num_tokens": 1063528004.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3840573525646497, "frac_reward_zero_std": 1.0, "grad_norm": 0.048000390495031936, "kl": 0.241455078125, "learning_rate": 1.5480240079014544e-05, "loss": 0.0097, "num_tokens": 1064094452.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38422804472134503, "frac_reward_zero_std": 1.0, "grad_norm": 0.0855380304154698, "kl": 0.271484375, "learning_rate": 1.547525555883785e-05, "loss": 0.0109, "num_tokens": 1064653604.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38439873687804044, "frac_reward_zero_std": 1.0, "grad_norm": 0.04674134771763267, "kl": 0.25341796875, "learning_rate": 1.5470269095143072e-05, "loss": 0.0101, "num_tokens": 1065218052.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38456942903473584, "frac_reward_zero_std": 1.0, "grad_norm": 0.07328253454216402, "kl": 0.28125, "learning_rate": 1.5465280689700234e-05, "loss": 0.0112, "num_tokens": 1065785316.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38474012119143125, "frac_reward_zero_std": 1.0, "grad_norm": 0.131349423162915, "kl": 0.322265625, "learning_rate": 1.5460290344280036e-05, "loss": 0.0129, "num_tokens": 1066354148.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38491081334812666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0741548225398214, "kl": 0.3095703125, "learning_rate": 1.5455298060653866e-05, "loss": 0.0124, "num_tokens": 1066922068.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38508150550482206, "frac_reward_zero_std": 1.0, "grad_norm": 0.03254585144437253, "kl": 0.29150390625, "learning_rate": 1.5450303840593815e-05, "loss": 0.0117, "num_tokens": 1067485156.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38525219766151747, "frac_reward_zero_std": 1.0, "grad_norm": 0.06146600764232836, "kl": 0.26708984375, "learning_rate": 1.5445307685872646e-05, "loss": 0.0107, "num_tokens": 1068048484.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3854228898182129, "frac_reward_zero_std": 1.0, "grad_norm": 0.03717400024387905, "kl": 0.275390625, "learning_rate": 1.544030959826381e-05, "loss": 0.011, "num_tokens": 1068608500.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38559358197490823, "frac_reward_zero_std": 1.0, "grad_norm": 0.10858629911961135, "kl": 0.269775390625, "learning_rate": 1.543530957954145e-05, "loss": 0.0108, "num_tokens": 1069180132.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38576427413160363, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0391501380496604, "kl": 0.27490234375, "learning_rate": 1.5430307631480395e-05, "loss": 0.011, "num_tokens": 1069750020.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38593496628829904, "frac_reward_zero_std": 1.0, "grad_norm": 0.03220338844364102, "kl": 0.3544921875, "learning_rate": 1.542530375585615e-05, "loss": 0.0142, "num_tokens": 1070311444.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38610565844499445, "frac_reward_zero_std": 1.0, "grad_norm": 0.21860628570487484, "kl": 0.41455078125, "learning_rate": 1.5420297954444918e-05, "loss": 0.0166, "num_tokens": 1070880452.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38627635060168986, "frac_reward_zero_std": 0.9375, "grad_norm": 0.08327507545707277, "kl": 0.45947265625, "learning_rate": 1.541529022902357e-05, "loss": 0.0184, "num_tokens": 1071447332.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38644704275838526, "frac_reward_zero_std": 0.9375, "grad_norm": 0.09102159235665055, "kl": 0.599609375, "learning_rate": 1.5410280581369675e-05, "loss": 0.024, "num_tokens": 1072010804.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38661773491508067, "frac_reward_zero_std": 1.0, "grad_norm": 0.036653063817451134, "kl": 0.50927734375, "learning_rate": 1.540526901326147e-05, "loss": 0.0204, "num_tokens": 1072578900.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3867884270717761, "frac_reward_zero_std": 0.9375, "grad_norm": 0.070661822404511, "kl": 0.48583984375, "learning_rate": 1.540025552647789e-05, "loss": 0.0194, "num_tokens": 1073151924.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3869591192284714, "frac_reward_zero_std": 1.0, "grad_norm": 0.1540199154754497, "kl": 0.521484375, "learning_rate": 1.5395240122798528e-05, "loss": 0.0209, "num_tokens": 1073715268.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38712981138516683, "frac_reward_zero_std": 1.0, "grad_norm": 0.03421398210223529, "kl": 0.43017578125, "learning_rate": 1.5390222804003688e-05, "loss": 0.0172, "num_tokens": 1074285540.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38730050354186224, "frac_reward_zero_std": 1.0, "grad_norm": 1.9919570151979786, "kl": 0.82861328125, "learning_rate": 1.5385203571874323e-05, "loss": 0.0331, "num_tokens": 1074857108.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38747119569855765, "frac_reward_zero_std": 1.0, "grad_norm": 0.1997149393423356, "kl": 0.80859375, "learning_rate": 1.5380182428192085e-05, "loss": 0.0324, "num_tokens": 1075424404.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 2041.12890625, "completions/mean_terminated_length": 289.0, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.38764188785525305, "frac_reward_zero_std": 1.0, "grad_norm": 0.2324608930869398, "kl": 1.111328125, "learning_rate": 1.5375159374739298e-05, "loss": 0.0444, "num_tokens": 1075994581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 2042.28125, "completions/mean_terminated_length": 584.0, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "epoch": 0.38781258001194846, "frac_reward_zero_std": 0.9375, "grad_norm": 0.3701589388460068, "kl": 1.451171875, "learning_rate": 1.537013441329897e-05, "loss": 0.058, "num_tokens": 1076558941.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38798327216864387, "frac_reward_zero_std": 1.0, "grad_norm": 0.1696709968612, "kl": 0.7626953125, "learning_rate": 1.536510754565477e-05, "loss": 0.0305, "num_tokens": 1077125325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3881539643253393, "frac_reward_zero_std": 1.0, "grad_norm": 0.07581193771356298, "kl": 0.390625, "learning_rate": 1.5360078773591066e-05, "loss": 0.0156, "num_tokens": 1077692973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3883246564820346, "frac_reward_zero_std": 1.0, "grad_norm": 0.10227624424653792, "kl": 0.29345703125, "learning_rate": 1.535504809889288e-05, "loss": 0.0118, "num_tokens": 1078262669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38849534863873003, "frac_reward_zero_std": 1.0, "grad_norm": 0.14757099860003742, "kl": 0.257080078125, "learning_rate": 1.535001552334593e-05, "loss": 0.0103, "num_tokens": 1078823101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38866604079542544, "frac_reward_zero_std": 1.0, "grad_norm": 0.043321021968521486, "kl": 0.1982421875, "learning_rate": 1.534498104873659e-05, "loss": 0.0079, "num_tokens": 1079386157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38883673295212084, "frac_reward_zero_std": 1.0, "grad_norm": 0.04042555584286618, "kl": 0.232421875, "learning_rate": 1.533994467685192e-05, "loss": 0.0093, "num_tokens": 1079947837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38900742510881625, "frac_reward_zero_std": 1.0, "grad_norm": 0.03081587081022355, "kl": 0.228271484375, "learning_rate": 1.533490640947965e-05, "loss": 0.0091, "num_tokens": 1080515805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38917811726551166, "frac_reward_zero_std": 1.0, "grad_norm": 0.02482100559852132, "kl": 0.212890625, "learning_rate": 1.5329866248408184e-05, "loss": 0.0085, "num_tokens": 1081079981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38934880942220707, "frac_reward_zero_std": 1.0, "grad_norm": 0.05224458209529651, "kl": 0.255615234375, "learning_rate": 1.5324824195426603e-05, "loss": 0.0102, "num_tokens": 1081648717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38951950157890247, "frac_reward_zero_std": 1.0, "grad_norm": 0.0975547353113757, "kl": 0.291015625, "learning_rate": 1.5319780252324644e-05, "loss": 0.0117, "num_tokens": 1082210077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3896901937355978, "frac_reward_zero_std": 1.0, "grad_norm": 0.09108696039135832, "kl": 0.263916015625, "learning_rate": 1.5314734420892725e-05, "loss": 0.0106, "num_tokens": 1082774349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38986088589229323, "frac_reward_zero_std": 1.0, "grad_norm": 0.06116279858464427, "kl": 0.255859375, "learning_rate": 1.5309686702921938e-05, "loss": 0.0102, "num_tokens": 1083335901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39003157804898864, "frac_reward_zero_std": 1.0, "grad_norm": 0.14434050998699072, "kl": 0.20849609375, "learning_rate": 1.5304637100204045e-05, "loss": 0.0083, "num_tokens": 1083903741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39020227020568404, "frac_reward_zero_std": 1.0, "grad_norm": 0.05761860868105969, "kl": 0.215087890625, "learning_rate": 1.5299585614531464e-05, "loss": 0.0086, "num_tokens": 1084480013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39037296236237945, "frac_reward_zero_std": 1.0, "grad_norm": 0.048553137550452485, "kl": 0.212158203125, "learning_rate": 1.5294532247697297e-05, "loss": 0.0085, "num_tokens": 1085052189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39054365451907486, "frac_reward_zero_std": 1.0, "grad_norm": 0.042609947484732555, "kl": 0.208984375, "learning_rate": 1.52894770014953e-05, "loss": 0.0084, "num_tokens": 1085624605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39071434667577026, "frac_reward_zero_std": 1.0, "grad_norm": 0.04138470956256804, "kl": 0.2177734375, "learning_rate": 1.5284419877719908e-05, "loss": 0.0087, "num_tokens": 1086187357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39088503883246567, "frac_reward_zero_std": 1.0, "grad_norm": 0.07579937095541064, "kl": 0.216796875, "learning_rate": 1.5279360878166218e-05, "loss": 0.0087, "num_tokens": 1086757133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.391055730989161, "frac_reward_zero_std": 1.0, "grad_norm": 0.05438982311498813, "kl": 0.247314453125, "learning_rate": 1.527430000462999e-05, "loss": 0.0099, "num_tokens": 1087332941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39122642314585643, "frac_reward_zero_std": 1.0, "grad_norm": 0.03162087894496486, "kl": 0.2158203125, "learning_rate": 1.526923725890765e-05, "loss": 0.0086, "num_tokens": 1087897277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39139711530255183, "frac_reward_zero_std": 1.0, "grad_norm": 0.0542828016622975, "kl": 0.208251953125, "learning_rate": 1.526417264279629e-05, "loss": 0.0083, "num_tokens": 1088459389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39156780745924724, "frac_reward_zero_std": 1.0, "grad_norm": 0.0412044748036244, "kl": 0.180908203125, "learning_rate": 1.525910615809367e-05, "loss": 0.0072, "num_tokens": 1089023645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39173849961594265, "frac_reward_zero_std": 1.0, "grad_norm": 0.03479825992887011, "kl": 0.220947265625, "learning_rate": 1.5254037806598208e-05, "loss": 0.0088, "num_tokens": 1089589821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39190919177263805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0616686934784142, "kl": 0.2060546875, "learning_rate": 1.5248967590108983e-05, "loss": 0.0082, "num_tokens": 1090158941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39207988392933346, "frac_reward_zero_std": 1.0, "grad_norm": 0.05105961637113314, "kl": 0.22412109375, "learning_rate": 1.524389551042574e-05, "loss": 0.009, "num_tokens": 1090723469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39225057608602887, "frac_reward_zero_std": 1.0, "grad_norm": 0.03386704698424953, "kl": 0.20947265625, "learning_rate": 1.5238821569348882e-05, "loss": 0.0084, "num_tokens": 1091290541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3924212682427242, "frac_reward_zero_std": 1.0, "grad_norm": 0.10971562176481235, "kl": 0.289306640625, "learning_rate": 1.523374576867948e-05, "loss": 0.0116, "num_tokens": 1091854749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3925919603994196, "frac_reward_zero_std": 1.0, "grad_norm": 0.14479159843938216, "kl": 0.37158203125, "learning_rate": 1.5228668110219257e-05, "loss": 0.0148, "num_tokens": 1092417645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39276265255611503, "frac_reward_zero_std": 1.0, "grad_norm": 0.06547639539839815, "kl": 0.36279296875, "learning_rate": 1.52235885957706e-05, "loss": 0.0145, "num_tokens": 1092983213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39293334471281044, "frac_reward_zero_std": 1.0, "grad_norm": 0.08148991595812168, "kl": 0.39892578125, "learning_rate": 1.5218507227136555e-05, "loss": 0.016, "num_tokens": 1093543341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39310403686950585, "frac_reward_zero_std": 1.0, "grad_norm": 0.027684290069495097, "kl": 0.328125, "learning_rate": 1.5213424006120816e-05, "loss": 0.0131, "num_tokens": 1094105597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39327472902620125, "frac_reward_zero_std": 1.0, "grad_norm": 0.026107630158615922, "kl": 0.28125, "learning_rate": 1.5208338934527751e-05, "loss": 0.0112, "num_tokens": 1094681005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39344542118289666, "frac_reward_zero_std": 1.0, "grad_norm": 0.08640411062147531, "kl": 0.32958984375, "learning_rate": 1.5203252014162373e-05, "loss": 0.0132, "num_tokens": 1095253757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39361611333959207, "frac_reward_zero_std": 1.0, "grad_norm": 0.2121863206893082, "kl": 0.392578125, "learning_rate": 1.5198163246830361e-05, "loss": 0.0157, "num_tokens": 1095817709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3937868054962875, "frac_reward_zero_std": 1.0, "grad_norm": 0.10007163892384321, "kl": 0.37890625, "learning_rate": 1.5193072634338033e-05, "loss": 0.0152, "num_tokens": 1096377069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3939574976529828, "frac_reward_zero_std": 1.0, "grad_norm": 0.044810400140906724, "kl": 0.3798828125, "learning_rate": 1.5187980178492384e-05, "loss": 0.0152, "num_tokens": 1096940205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39412818980967823, "frac_reward_zero_std": 1.0, "grad_norm": 0.07716439416071443, "kl": 0.35986328125, "learning_rate": 1.5182885881101043e-05, "loss": 0.0144, "num_tokens": 1097507757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39429888196637364, "frac_reward_zero_std": 1.0, "grad_norm": 0.13996844278319698, "kl": 0.36376953125, "learning_rate": 1.5177789743972311e-05, "loss": 0.0145, "num_tokens": 1098070461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39446957412306904, "frac_reward_zero_std": 1.0, "grad_norm": 0.06140668879133598, "kl": 0.36669921875, "learning_rate": 1.5172691768915124e-05, "loss": 0.0147, "num_tokens": 1098637565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39464026627976445, "frac_reward_zero_std": 1.0, "grad_norm": 0.12469850981670705, "kl": 0.328125, "learning_rate": 1.5167591957739083e-05, "loss": 0.0131, "num_tokens": 1099205229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39481095843645986, "frac_reward_zero_std": 1.0, "grad_norm": 0.07533810404790339, "kl": 0.246826171875, "learning_rate": 1.5162490312254436e-05, "loss": 0.0099, "num_tokens": 1099785757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39498165059315526, "frac_reward_zero_std": 1.0, "grad_norm": 0.10011398066833427, "kl": 0.27880859375, "learning_rate": 1.5157386834272082e-05, "loss": 0.0111, "num_tokens": 1100350861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39515234274985067, "frac_reward_zero_std": 1.0, "grad_norm": 0.10013836777273415, "kl": 0.26904296875, "learning_rate": 1.5152281525603577e-05, "loss": 0.0108, "num_tokens": 1100913501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.395323034906546, "frac_reward_zero_std": 1.0, "grad_norm": 0.06838412236980013, "kl": 0.26123046875, "learning_rate": 1.5147174388061113e-05, "loss": 0.0104, "num_tokens": 1101480509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39549372706324143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0510490987522591, "kl": 0.240966796875, "learning_rate": 1.5142065423457547e-05, "loss": 0.0096, "num_tokens": 1102042029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39566441921993684, "frac_reward_zero_std": 1.0, "grad_norm": 0.026764121352615494, "kl": 0.242919921875, "learning_rate": 1.5136954633606369e-05, "loss": 0.0097, "num_tokens": 1102607949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39583511137663224, "frac_reward_zero_std": 1.0, "grad_norm": 0.09050331677682587, "kl": 0.23291015625, "learning_rate": 1.5131842020321733e-05, "loss": 0.0093, "num_tokens": 1103173021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39600580353332765, "frac_reward_zero_std": 1.0, "grad_norm": 0.08040848320882583, "kl": 0.234130859375, "learning_rate": 1.5126727585418428e-05, "loss": 0.0094, "num_tokens": 1103747949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39617649569002306, "frac_reward_zero_std": 1.0, "grad_norm": 0.06777325845091217, "kl": 0.2919921875, "learning_rate": 1.5121611330711894e-05, "loss": 0.0117, "num_tokens": 1104309597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39634718784671846, "frac_reward_zero_std": 1.0, "grad_norm": 0.05518755015012343, "kl": 0.28515625, "learning_rate": 1.5116493258018219e-05, "loss": 0.0114, "num_tokens": 1104874173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39651788000341387, "frac_reward_zero_std": 1.0, "grad_norm": 0.049104434085471056, "kl": 0.34619140625, "learning_rate": 1.5111373369154128e-05, "loss": 0.0138, "num_tokens": 1105439837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3966885721601092, "frac_reward_zero_std": 1.0, "grad_norm": 0.172342438172236, "kl": 0.34814453125, "learning_rate": 1.5106251665937004e-05, "loss": 0.0139, "num_tokens": 1106010477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3968592643168046, "frac_reward_zero_std": 1.0, "grad_norm": 0.08047932502157275, "kl": 0.365234375, "learning_rate": 1.5101128150184866e-05, "loss": 0.0146, "num_tokens": 1106576525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39702995647350003, "frac_reward_zero_std": 1.0, "grad_norm": 0.1176961742064408, "kl": 0.3759765625, "learning_rate": 1.509600282371637e-05, "loss": 0.015, "num_tokens": 1107140189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39720064863019544, "frac_reward_zero_std": 1.0, "grad_norm": 0.1492366381333995, "kl": 0.3955078125, "learning_rate": 1.5090875688350828e-05, "loss": 0.0158, "num_tokens": 1107705629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39737134078689085, "frac_reward_zero_std": 1.0, "grad_norm": 0.15584159944271148, "kl": 0.326171875, "learning_rate": 1.5085746745908187e-05, "loss": 0.013, "num_tokens": 1108271885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39754203294358625, "frac_reward_zero_std": 1.0, "grad_norm": 0.18916362128381742, "kl": 0.225830078125, "learning_rate": 1.5080615998209033e-05, "loss": 0.009, "num_tokens": 1108834285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39771272510028166, "frac_reward_zero_std": 1.0, "grad_norm": 0.13465785671817426, "kl": 0.141845703125, "learning_rate": 1.5075483447074608e-05, "loss": 0.0057, "num_tokens": 1109394989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39788341725697707, "frac_reward_zero_std": 1.0, "grad_norm": 0.06576663916665375, "kl": 0.0872802734375, "learning_rate": 1.5070349094326766e-05, "loss": 0.0035, "num_tokens": 1109961773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3980541094136724, "frac_reward_zero_std": 1.0, "grad_norm": 0.024232764981961554, "kl": 0.0567626953125, "learning_rate": 1.506521294178803e-05, "loss": 0.0023, "num_tokens": 1110525085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3982248015703678, "frac_reward_zero_std": 1.0, "grad_norm": 0.01890071857566116, "kl": 0.04449462890625, "learning_rate": 1.5060074991281537e-05, "loss": 0.0018, "num_tokens": 1111092909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39839549372706323, "frac_reward_zero_std": 1.0, "grad_norm": 0.027174030520731807, "kl": 0.04071044921875, "learning_rate": 1.5054935244631085e-05, "loss": 0.0016, "num_tokens": 1111657261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39856618588375864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0239728104777235, "kl": 0.03802490234375, "learning_rate": 1.5049793703661091e-05, "loss": 0.0015, "num_tokens": 1112219085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39873687804045405, "frac_reward_zero_std": 1.0, "grad_norm": 0.023155416600074405, "kl": 0.0350341796875, "learning_rate": 1.5044650370196622e-05, "loss": 0.0014, "num_tokens": 1112786189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39890757019714945, "frac_reward_zero_std": 1.0, "grad_norm": 0.018959529306518175, "kl": 0.03314208984375, "learning_rate": 1.5039505246063372e-05, "loss": 0.0013, "num_tokens": 1113352333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39907826235384486, "frac_reward_zero_std": 1.0, "grad_norm": 0.04694171330129126, "kl": 0.03759765625, "learning_rate": 1.5034358333087678e-05, "loss": 0.0015, "num_tokens": 1113914445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39924895451054027, "frac_reward_zero_std": 1.0, "grad_norm": 0.034210949729535554, "kl": 0.0338134765625, "learning_rate": 1.50292096330965e-05, "loss": 0.0014, "num_tokens": 1114480973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3994196466672356, "frac_reward_zero_std": 1.0, "grad_norm": 0.04206420241588996, "kl": 0.033599853515625, "learning_rate": 1.5024059147917448e-05, "loss": 0.0013, "num_tokens": 1115043309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.399590338823931, "frac_reward_zero_std": 1.0, "grad_norm": 0.13126113368867234, "kl": 0.0487060546875, "learning_rate": 1.5018906879378761e-05, "loss": 0.0019, "num_tokens": 1115607741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39976103098062643, "frac_reward_zero_std": 1.0, "grad_norm": 0.03290796301673703, "kl": 0.03094482421875, "learning_rate": 1.5013752829309298e-05, "loss": 0.0012, "num_tokens": 1116178045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39993172313732184, "frac_reward_zero_std": 1.0, "grad_norm": 0.023756030825569763, "kl": 0.029693603515625, "learning_rate": 1.500859699953857e-05, "loss": 0.0012, "num_tokens": 1116745709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40010241529401724, "frac_reward_zero_std": 1.0, "grad_norm": 0.01622131692012049, "kl": 0.02911376953125, "learning_rate": 1.5003439391896706e-05, "loss": 0.0012, "num_tokens": 1117315549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40027310745071265, "frac_reward_zero_std": 1.0, "grad_norm": 0.022614583608501104, "kl": 0.0311279296875, "learning_rate": 1.4998280008214473e-05, "loss": 0.0012, "num_tokens": 1117884557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40044379960740806, "frac_reward_zero_std": 1.0, "grad_norm": 0.03200030140843079, "kl": 0.034423828125, "learning_rate": 1.4993118850323265e-05, "loss": 0.0014, "num_tokens": 1118451069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40061449176410346, "frac_reward_zero_std": 1.0, "grad_norm": 0.021325509427825722, "kl": 0.032073974609375, "learning_rate": 1.4987955920055107e-05, "loss": 0.0013, "num_tokens": 1119018509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4007851839207988, "frac_reward_zero_std": 1.0, "grad_norm": 0.04385215865066867, "kl": 0.037445068359375, "learning_rate": 1.4982791219242647e-05, "loss": 0.0015, "num_tokens": 1119577885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4009558760774942, "frac_reward_zero_std": 1.0, "grad_norm": 0.03197637775030946, "kl": 0.03509521484375, "learning_rate": 1.497762474971918e-05, "loss": 0.0014, "num_tokens": 1120142557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40112656823418963, "frac_reward_zero_std": 1.0, "grad_norm": 0.02473717208233853, "kl": 0.044189453125, "learning_rate": 1.4972456513318602e-05, "loss": 0.0018, "num_tokens": 1120706413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40129726039088504, "frac_reward_zero_std": 1.0, "grad_norm": 0.027317170442612115, "kl": 0.03662109375, "learning_rate": 1.4967286511875462e-05, "loss": 0.0015, "num_tokens": 1121274605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40146795254758044, "frac_reward_zero_std": 1.0, "grad_norm": 0.028212078539094552, "kl": 0.03814697265625, "learning_rate": 1.4962114747224916e-05, "loss": 0.0015, "num_tokens": 1121834829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40163864470427585, "frac_reward_zero_std": 1.0, "grad_norm": 0.019391927870650295, "kl": 0.0380859375, "learning_rate": 1.4956941221202757e-05, "loss": 0.0015, "num_tokens": 1122399373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40180933686097126, "frac_reward_zero_std": 1.0, "grad_norm": 0.027959512009818183, "kl": 0.04180908203125, "learning_rate": 1.4951765935645401e-05, "loss": 0.0017, "num_tokens": 1122963453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40198002901766666, "frac_reward_zero_std": 1.0, "grad_norm": 0.030992294734142452, "kl": 0.04339599609375, "learning_rate": 1.4946588892389887e-05, "loss": 0.0017, "num_tokens": 1123530685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.402150721174362, "frac_reward_zero_std": 1.0, "grad_norm": 0.031134485929226956, "kl": 0.04644775390625, "learning_rate": 1.4941410093273875e-05, "loss": 0.0019, "num_tokens": 1124097821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4023214133310574, "frac_reward_zero_std": 1.0, "grad_norm": 0.06064756325491097, "kl": 0.05810546875, "learning_rate": 1.4936229540135659e-05, "loss": 0.0023, "num_tokens": 1124660589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4024921054877528, "frac_reward_zero_std": 1.0, "grad_norm": 0.027680963068440933, "kl": 0.06549072265625, "learning_rate": 1.493104723481414e-05, "loss": 0.0026, "num_tokens": 1125227533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40266279764444823, "frac_reward_zero_std": 1.0, "grad_norm": 0.02765970017922979, "kl": 0.08721923828125, "learning_rate": 1.4925863179148853e-05, "loss": 0.0035, "num_tokens": 1125793325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40283348980114364, "frac_reward_zero_std": 1.0, "grad_norm": 0.04012586989824767, "kl": 0.105224609375, "learning_rate": 1.4920677374979952e-05, "loss": 0.0042, "num_tokens": 1126360013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40300418195783905, "frac_reward_zero_std": 1.0, "grad_norm": 0.044411791784377284, "kl": 0.103759765625, "learning_rate": 1.491548982414821e-05, "loss": 0.0042, "num_tokens": 1126922637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40317487411453445, "frac_reward_zero_std": 1.0, "grad_norm": 0.09840745997442114, "kl": 0.1417236328125, "learning_rate": 1.491030052849502e-05, "loss": 0.0057, "num_tokens": 1127483485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40334556627122986, "frac_reward_zero_std": 1.0, "grad_norm": 0.04275459682321592, "kl": 0.173828125, "learning_rate": 1.4905109489862393e-05, "loss": 0.007, "num_tokens": 1128048605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4035162584279252, "frac_reward_zero_std": 1.0, "grad_norm": 0.04092406848975906, "kl": 0.2138671875, "learning_rate": 1.4899916710092961e-05, "loss": 0.0086, "num_tokens": 1128614493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4036869505846206, "frac_reward_zero_std": 1.0, "grad_norm": 0.2220439433479162, "kl": 0.29931640625, "learning_rate": 1.4894722191029974e-05, "loss": 0.012, "num_tokens": 1129181773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.403857642741316, "frac_reward_zero_std": 1.0, "grad_norm": 0.10077542097103394, "kl": 0.365234375, "learning_rate": 1.4889525934517299e-05, "loss": 0.0146, "num_tokens": 1129748093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40402833489801143, "frac_reward_zero_std": 1.0, "grad_norm": 0.1585314078005914, "kl": 0.52685546875, "learning_rate": 1.4884327942399419e-05, "loss": 0.0211, "num_tokens": 1130310381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40419902705470684, "frac_reward_zero_std": 1.0, "grad_norm": 0.24309827834849068, "kl": 0.650390625, "learning_rate": 1.4879128216521433e-05, "loss": 0.026, "num_tokens": 1130881661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40436971921140225, "frac_reward_zero_std": 1.0, "grad_norm": 0.3810462017924199, "kl": 0.6513671875, "learning_rate": 1.4873926758729056e-05, "loss": 0.026, "num_tokens": 1131454733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40454041136809765, "frac_reward_zero_std": 1.0, "grad_norm": 0.27758617138409325, "kl": 0.5068359375, "learning_rate": 1.4868723570868619e-05, "loss": 0.0203, "num_tokens": 1132021341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40471110352479306, "frac_reward_zero_std": 1.0, "grad_norm": 0.1350887746071468, "kl": 0.3056640625, "learning_rate": 1.4863518654787067e-05, "loss": 0.0122, "num_tokens": 1132583773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4048817956814884, "frac_reward_zero_std": 1.0, "grad_norm": 0.048094884325665915, "kl": 0.179931640625, "learning_rate": 1.485831201233195e-05, "loss": 0.0072, "num_tokens": 1133154541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4050524878381838, "frac_reward_zero_std": 1.0, "grad_norm": 0.04086526423251642, "kl": 0.1253662109375, "learning_rate": 1.485310364535145e-05, "loss": 0.005, "num_tokens": 1133725133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4052231799948792, "frac_reward_zero_std": 1.0, "grad_norm": 0.06693929851146588, "kl": 0.1024169921875, "learning_rate": 1.4847893555694337e-05, "loss": 0.0041, "num_tokens": 1134288253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40539387215157463, "frac_reward_zero_std": 1.0, "grad_norm": 0.15686976635355906, "kl": 0.0897216796875, "learning_rate": 1.4842681745210016e-05, "loss": 0.0036, "num_tokens": 1134853517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40556456430827004, "frac_reward_zero_std": 1.0, "grad_norm": 0.06513283821630414, "kl": 0.0672607421875, "learning_rate": 1.4837468215748483e-05, "loss": 0.0027, "num_tokens": 1135415117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40573525646496544, "frac_reward_zero_std": 1.0, "grad_norm": 0.15051862659774148, "kl": 0.07611083984375, "learning_rate": 1.4832252969160359e-05, "loss": 0.003, "num_tokens": 1135984557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40590594862166085, "frac_reward_zero_std": 1.0, "grad_norm": 0.01219324448210868, "kl": 0.04315185546875, "learning_rate": 1.482703600729686e-05, "loss": 0.0017, "num_tokens": 1136548797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40607664077835626, "frac_reward_zero_std": 1.0, "grad_norm": 0.010116147321719848, "kl": 0.04132080078125, "learning_rate": 1.482181733200983e-05, "loss": 0.0017, "num_tokens": 1137110973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4062473329350516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0443177375676873, "kl": 0.045166015625, "learning_rate": 1.48165969451517e-05, "loss": 0.0018, "num_tokens": 1137671837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.406418025091747, "frac_reward_zero_std": 1.0, "grad_norm": 0.010911838432506035, "kl": 0.037353515625, "learning_rate": 1.4811374848575529e-05, "loss": 0.0015, "num_tokens": 1138237005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4065887172484424, "frac_reward_zero_std": 1.0, "grad_norm": 0.022435680850041103, "kl": 0.03857421875, "learning_rate": 1.4806151044134962e-05, "loss": 0.0015, "num_tokens": 1138796285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40675940940513783, "frac_reward_zero_std": 1.0, "grad_norm": 0.010370537682302708, "kl": 0.035888671875, "learning_rate": 1.480092553368427e-05, "loss": 0.0014, "num_tokens": 1139357453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40693010156183324, "frac_reward_zero_std": 1.0, "grad_norm": 0.01388180876642019, "kl": 0.03564453125, "learning_rate": 1.4795698319078315e-05, "loss": 0.0014, "num_tokens": 1139924461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40710079371852864, "frac_reward_zero_std": 1.0, "grad_norm": 0.026210078265133795, "kl": 0.03729248046875, "learning_rate": 1.4790469402172574e-05, "loss": 0.0015, "num_tokens": 1140493037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40727148587522405, "frac_reward_zero_std": 1.0, "grad_norm": 0.30352837766724255, "kl": 0.08209228515625, "learning_rate": 1.478523878482312e-05, "loss": 0.0033, "num_tokens": 1141052461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40744217803191946, "frac_reward_zero_std": 1.0, "grad_norm": 0.017332177500583044, "kl": 0.035888671875, "learning_rate": 1.4780006468886632e-05, "loss": 0.0014, "num_tokens": 1141614301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4076128701886148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01874798741387259, "kl": 0.0372314453125, "learning_rate": 1.4774772456220398e-05, "loss": 0.0015, "num_tokens": 1142179325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4077835623453102, "frac_reward_zero_std": 1.0, "grad_norm": 0.027965683127705043, "kl": 0.042724609375, "learning_rate": 1.4769536748682303e-05, "loss": 0.0017, "num_tokens": 1142754221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4079542545020056, "frac_reward_zero_std": 1.0, "grad_norm": 0.06508427791148008, "kl": 0.0516357421875, "learning_rate": 1.476429934813083e-05, "loss": 0.0021, "num_tokens": 1143316797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.408124946658701, "frac_reward_zero_std": 1.0, "grad_norm": 0.015209255547356441, "kl": 0.04296875, "learning_rate": 1.475906025642507e-05, "loss": 0.0017, "num_tokens": 1143885997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40829563881539643, "frac_reward_zero_std": 1.0, "grad_norm": 0.018226913874243956, "kl": 0.04486083984375, "learning_rate": 1.4753819475424714e-05, "loss": 0.0018, "num_tokens": 1144448205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40846633097209184, "frac_reward_zero_std": 1.0, "grad_norm": 0.014903863365850503, "kl": 0.0478515625, "learning_rate": 1.4748577006990043e-05, "loss": 0.0019, "num_tokens": 1145019981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40863702312878725, "frac_reward_zero_std": 1.0, "grad_norm": 0.040512423841191435, "kl": 0.07183837890625, "learning_rate": 1.474333285298195e-05, "loss": 0.0029, "num_tokens": 1145587613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40880771528548265, "frac_reward_zero_std": 1.0, "grad_norm": 0.023403790133030903, "kl": 0.066162109375, "learning_rate": 1.473808701526192e-05, "loss": 0.0026, "num_tokens": 1146153725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.408978407442178, "frac_reward_zero_std": 1.0, "grad_norm": 0.05819105571434881, "kl": 0.1053466796875, "learning_rate": 1.4732839495692035e-05, "loss": 0.0042, "num_tokens": 1146717485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4091490995988734, "frac_reward_zero_std": 1.0, "grad_norm": 0.043814330492349854, "kl": 0.0953369140625, "learning_rate": 1.4727590296134975e-05, "loss": 0.0038, "num_tokens": 1147277117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4093197917555688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0597248371767382, "kl": 0.111328125, "learning_rate": 1.4722339418454015e-05, "loss": 0.0045, "num_tokens": 1147841149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4094904839122642, "frac_reward_zero_std": 1.0, "grad_norm": 0.07136029146546123, "kl": 0.1231689453125, "learning_rate": 1.4717086864513026e-05, "loss": 0.0049, "num_tokens": 1148408925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40966117606895963, "frac_reward_zero_std": 1.0, "grad_norm": 0.08042693920248827, "kl": 0.1448974609375, "learning_rate": 1.471183263617648e-05, "loss": 0.0058, "num_tokens": 1148975517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40983186822565504, "frac_reward_zero_std": 1.0, "grad_norm": 0.07552100502967378, "kl": 0.176025390625, "learning_rate": 1.4706576735309435e-05, "loss": 0.007, "num_tokens": 1149540109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41000256038235044, "frac_reward_zero_std": 1.0, "grad_norm": 0.18458384427805571, "kl": 0.23486328125, "learning_rate": 1.4701319163777544e-05, "loss": 0.0094, "num_tokens": 1150103901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41017325253904585, "frac_reward_zero_std": 1.0, "grad_norm": 0.45784683100837037, "kl": 0.24365234375, "learning_rate": 1.4696059923447062e-05, "loss": 0.0098, "num_tokens": 1150669101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41034394469574126, "frac_reward_zero_std": 1.0, "grad_norm": 0.83734239843143, "kl": 0.256103515625, "learning_rate": 1.469079901618482e-05, "loss": 0.0102, "num_tokens": 1151234397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1504.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 37.0, "completions/min_terminated_length": 0.0, "epoch": 0.4105146368524366, "frac_reward_zero_std": 1.0, "grad_norm": 2.363768605199893, "kl": 2.13671875, "learning_rate": 1.4685536443858262e-05, "loss": 0.0856, "num_tokens": 1151662877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 5.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 3.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 2.0, "completions/min_terminated_length": 0.0, "epoch": 0.410685329009132, "frac_reward_zero_std": 1.0, "grad_norm": 14.787506727398366, "kl": 13.765625, "learning_rate": 1.4680272208335398e-05, "loss": 0.5512, "num_tokens": 1151707453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4108560211658274, "frac_reward_zero_std": 1.0, "grad_norm": 1.608281079005399, "kl": 1.0068359375, "learning_rate": 1.4675006311484854e-05, "loss": 0.0403, "num_tokens": 1152271677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41102671332252283, "frac_reward_zero_std": 1.0, "grad_norm": 0.3528046830481036, "kl": 0.3984375, "learning_rate": 1.4669738755175822e-05, "loss": 0.0159, "num_tokens": 1152840301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41119740547921824, "frac_reward_zero_std": 1.0, "grad_norm": 0.14630077407159847, "kl": 0.174560546875, "learning_rate": 1.4664469541278106e-05, "loss": 0.007, "num_tokens": 1153402141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41136809763591364, "frac_reward_zero_std": 1.0, "grad_norm": 0.1302531011747498, "kl": 0.0980224609375, "learning_rate": 1.4659198671662073e-05, "loss": 0.0039, "num_tokens": 1153972813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41153878979260905, "frac_reward_zero_std": 1.0, "grad_norm": 0.05696297666183417, "kl": 0.072509765625, "learning_rate": 1.4653926148198705e-05, "loss": 0.0029, "num_tokens": 1154537821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41170948194930446, "frac_reward_zero_std": 1.0, "grad_norm": 0.02788377489166307, "kl": 0.06085205078125, "learning_rate": 1.4648651972759549e-05, "loss": 0.0024, "num_tokens": 1155103677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4118801741059998, "frac_reward_zero_std": 1.0, "grad_norm": 0.022439230023110743, "kl": 0.05230712890625, "learning_rate": 1.4643376147216751e-05, "loss": 0.0021, "num_tokens": 1155666445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4120508662626952, "frac_reward_zero_std": 1.0, "grad_norm": 0.014438129402244346, "kl": 0.04290771484375, "learning_rate": 1.4638098673443036e-05, "loss": 0.0017, "num_tokens": 1156236653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4122215584193906, "frac_reward_zero_std": 1.0, "grad_norm": 0.03830337065573127, "kl": 0.051025390625, "learning_rate": 1.4632819553311715e-05, "loss": 0.002, "num_tokens": 1156809261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41239225057608603, "frac_reward_zero_std": 1.0, "grad_norm": 0.02571451529547636, "kl": 0.04534912109375, "learning_rate": 1.4627538788696691e-05, "loss": 0.0018, "num_tokens": 1157371021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41256294273278143, "frac_reward_zero_std": 1.0, "grad_norm": 0.03774188081805539, "kl": 0.0458984375, "learning_rate": 1.4622256381472442e-05, "loss": 0.0018, "num_tokens": 1157936749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41273363488947684, "frac_reward_zero_std": 1.0, "grad_norm": 0.028765474078997197, "kl": 0.04486083984375, "learning_rate": 1.4616972333514028e-05, "loss": 0.0018, "num_tokens": 1158501869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41290432704617225, "frac_reward_zero_std": 1.0, "grad_norm": 0.030195700664010262, "kl": 0.04437255859375, "learning_rate": 1.4611686646697103e-05, "loss": 0.0018, "num_tokens": 1159064189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41307501920286765, "frac_reward_zero_std": 1.0, "grad_norm": 0.023411171528238742, "kl": 0.04266357421875, "learning_rate": 1.4606399322897889e-05, "loss": 0.0017, "num_tokens": 1159626125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.413245711359563, "frac_reward_zero_std": 1.0, "grad_norm": 0.01807299414519828, "kl": 0.0447998046875, "learning_rate": 1.4601110363993196e-05, "loss": 0.0018, "num_tokens": 1160194525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4134164035162584, "frac_reward_zero_std": 1.0, "grad_norm": 0.017274285492418648, "kl": 0.0438232421875, "learning_rate": 1.4595819771860415e-05, "loss": 0.0018, "num_tokens": 1160773021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4135870956729538, "frac_reward_zero_std": 1.0, "grad_norm": 0.017208032776199002, "kl": 0.04632568359375, "learning_rate": 1.4590527548377513e-05, "loss": 0.0019, "num_tokens": 1161338989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4137577878296492, "frac_reward_zero_std": 1.0, "grad_norm": 0.06274036327834205, "kl": 0.0570068359375, "learning_rate": 1.4585233695423043e-05, "loss": 0.0023, "num_tokens": 1161905581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41392847998634463, "frac_reward_zero_std": 1.0, "grad_norm": 0.011669150183233262, "kl": 0.0499267578125, "learning_rate": 1.4579938214876126e-05, "loss": 0.002, "num_tokens": 1162474605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41409917214304004, "frac_reward_zero_std": 1.0, "grad_norm": 0.019989890724195186, "kl": 0.0511474609375, "learning_rate": 1.457464110861647e-05, "loss": 0.002, "num_tokens": 1163039197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41426986429973545, "frac_reward_zero_std": 1.0, "grad_norm": 0.015981916255188983, "kl": 0.050048828125, "learning_rate": 1.4569342378524356e-05, "loss": 0.002, "num_tokens": 1163605085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41444055645643085, "frac_reward_zero_std": 1.0, "grad_norm": 0.015501477545414257, "kl": 0.0518798828125, "learning_rate": 1.456404202648064e-05, "loss": 0.0021, "num_tokens": 1164168749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4146112486131262, "frac_reward_zero_std": 1.0, "grad_norm": 0.022654767249655313, "kl": 0.0577392578125, "learning_rate": 1.4558740054366758e-05, "loss": 0.0023, "num_tokens": 1164735325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4147819407698216, "frac_reward_zero_std": 1.0, "grad_norm": 0.028112855100134303, "kl": 0.0587158203125, "learning_rate": 1.4553436464064719e-05, "loss": 0.0023, "num_tokens": 1165294829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.414952632926517, "frac_reward_zero_std": 1.0, "grad_norm": 0.011492897069853378, "kl": 0.05474853515625, "learning_rate": 1.4548131257457103e-05, "loss": 0.0022, "num_tokens": 1165864733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4151233250832124, "frac_reward_zero_std": 1.0, "grad_norm": 0.11286453211041303, "kl": 0.07562255859375, "learning_rate": 1.4542824436427068e-05, "loss": 0.003, "num_tokens": 1166427565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41529401723990783, "frac_reward_zero_std": 1.0, "grad_norm": 0.1263040001172735, "kl": 0.083251953125, "learning_rate": 1.4537516002858348e-05, "loss": 0.0033, "num_tokens": 1166991005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41546470939660324, "frac_reward_zero_std": 1.0, "grad_norm": 0.015529410375758624, "kl": 0.05859375, "learning_rate": 1.4532205958635237e-05, "loss": 0.0023, "num_tokens": 1167555805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41563540155329864, "frac_reward_zero_std": 1.0, "grad_norm": 0.021674112955826646, "kl": 0.07281494140625, "learning_rate": 1.4526894305642618e-05, "loss": 0.0029, "num_tokens": 1168120013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41580609370999405, "frac_reward_zero_std": 1.0, "grad_norm": 0.07476869467839117, "kl": 0.079345703125, "learning_rate": 1.4521581045765928e-05, "loss": 0.0032, "num_tokens": 1168695085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4159767858666894, "frac_reward_zero_std": 1.0, "grad_norm": 0.12345893212124112, "kl": 0.0845947265625, "learning_rate": 1.4516266180891191e-05, "loss": 0.0034, "num_tokens": 1169268621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4161474780233848, "frac_reward_zero_std": 1.0, "grad_norm": 0.014513251430681386, "kl": 0.064697265625, "learning_rate": 1.4510949712904985e-05, "loss": 0.0026, "num_tokens": 1169835181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4163181701800802, "frac_reward_zero_std": 1.0, "grad_norm": 0.017335371786443874, "kl": 0.0814208984375, "learning_rate": 1.4505631643694469e-05, "loss": 0.0033, "num_tokens": 1170401789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4164888623367756, "frac_reward_zero_std": 1.0, "grad_norm": 0.03266533702142282, "kl": 0.0887451171875, "learning_rate": 1.450031197514736e-05, "loss": 0.0035, "num_tokens": 1170968045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41665955449347103, "frac_reward_zero_std": 1.0, "grad_norm": 0.02761995481903102, "kl": 0.1033935546875, "learning_rate": 1.4494990709151957e-05, "loss": 0.0041, "num_tokens": 1171533053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41683024665016644, "frac_reward_zero_std": 1.0, "grad_norm": 0.02930471762783931, "kl": 0.1058349609375, "learning_rate": 1.4489667847597106e-05, "loss": 0.0042, "num_tokens": 1172099293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41700093880686184, "frac_reward_zero_std": 1.0, "grad_norm": 0.015596401305783231, "kl": 0.119384765625, "learning_rate": 1.4484343392372242e-05, "loss": 0.0048, "num_tokens": 1172664445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41717163096355725, "frac_reward_zero_std": 1.0, "grad_norm": 0.03421647497448011, "kl": 0.143310546875, "learning_rate": 1.4479017345367345e-05, "loss": 0.0057, "num_tokens": 1173228285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4173423231202526, "frac_reward_zero_std": 1.0, "grad_norm": 0.04727860199786797, "kl": 0.173583984375, "learning_rate": 1.4473689708472975e-05, "loss": 0.0069, "num_tokens": 1173797565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.417513015276948, "frac_reward_zero_std": 1.0, "grad_norm": 0.03690591853355926, "kl": 0.162109375, "learning_rate": 1.446836048358025e-05, "loss": 0.0065, "num_tokens": 1174362269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4176837074336434, "frac_reward_zero_std": 1.0, "grad_norm": 0.048330900680154686, "kl": 0.168212890625, "learning_rate": 1.4463029672580853e-05, "loss": 0.0067, "num_tokens": 1174926285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4178543995903388, "frac_reward_zero_std": 1.0, "grad_norm": 0.052639848053069406, "kl": 0.161376953125, "learning_rate": 1.4457697277367026e-05, "loss": 0.0064, "num_tokens": 1175489277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4180250917470342, "frac_reward_zero_std": 1.0, "grad_norm": 0.03616414471699411, "kl": 0.1353759765625, "learning_rate": 1.4452363299831582e-05, "loss": 0.0054, "num_tokens": 1176055853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41819578390372963, "frac_reward_zero_std": 1.0, "grad_norm": 0.016523620913833425, "kl": 0.1124267578125, "learning_rate": 1.4447027741867886e-05, "loss": 0.0045, "num_tokens": 1176618525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41836647606042504, "frac_reward_zero_std": 1.0, "grad_norm": 0.014161748979832666, "kl": 0.107666015625, "learning_rate": 1.4441690605369869e-05, "loss": 0.0043, "num_tokens": 1177187213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41853716821712045, "frac_reward_zero_std": 1.0, "grad_norm": 0.013426860705182684, "kl": 0.120361328125, "learning_rate": 1.4436351892232025e-05, "loss": 0.0048, "num_tokens": 1177751069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4187078603738158, "frac_reward_zero_std": 1.0, "grad_norm": 0.022958253385272027, "kl": 0.104736328125, "learning_rate": 1.4431011604349402e-05, "loss": 0.0042, "num_tokens": 1178316765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4188785525305112, "frac_reward_zero_std": 1.0, "grad_norm": 0.016987667465696604, "kl": 0.0987548828125, "learning_rate": 1.4425669743617609e-05, "loss": 0.0039, "num_tokens": 1178882637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4190492446872066, "frac_reward_zero_std": 1.0, "grad_norm": 0.048096024534464196, "kl": 0.10986328125, "learning_rate": 1.4420326311932815e-05, "loss": 0.0044, "num_tokens": 1179453933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.419219936843902, "frac_reward_zero_std": 1.0, "grad_norm": 0.03803797981823239, "kl": 0.1058349609375, "learning_rate": 1.4414981311191744e-05, "loss": 0.0042, "num_tokens": 1180019053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4193906290005974, "frac_reward_zero_std": 1.0, "grad_norm": 0.01714799727574015, "kl": 0.0843505859375, "learning_rate": 1.440963474329168e-05, "loss": 0.0034, "num_tokens": 1180586637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41956132115729283, "frac_reward_zero_std": 1.0, "grad_norm": 0.07213986065567494, "kl": 0.110595703125, "learning_rate": 1.440428661013046e-05, "loss": 0.0044, "num_tokens": 1181152797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41973201331398824, "frac_reward_zero_std": 1.0, "grad_norm": 0.027521496357356363, "kl": 0.0858154296875, "learning_rate": 1.439893691360648e-05, "loss": 0.0034, "num_tokens": 1181725597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41990270547068365, "frac_reward_zero_std": 1.0, "grad_norm": 0.02109211162719867, "kl": 0.09619140625, "learning_rate": 1.439358565561869e-05, "loss": 0.0039, "num_tokens": 1182288109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.420073397627379, "frac_reward_zero_std": 1.0, "grad_norm": 0.01645030191622638, "kl": 0.099853515625, "learning_rate": 1.4388232838066587e-05, "loss": 0.004, "num_tokens": 1182851245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4202440897840744, "frac_reward_zero_std": 1.0, "grad_norm": 0.030853027876463598, "kl": 0.1246337890625, "learning_rate": 1.4382878462850233e-05, "loss": 0.005, "num_tokens": 1183418333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4204147819407698, "frac_reward_zero_std": 1.0, "grad_norm": 0.03734432728504003, "kl": 0.1268310546875, "learning_rate": 1.4377522531870242e-05, "loss": 0.0051, "num_tokens": 1183991677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4205854740974652, "frac_reward_zero_std": 1.0, "grad_norm": 0.024148148269332904, "kl": 0.130859375, "learning_rate": 1.437216504702777e-05, "loss": 0.0052, "num_tokens": 1184562765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4207561662541606, "frac_reward_zero_std": 1.0, "grad_norm": 0.024336183394817017, "kl": 0.119873046875, "learning_rate": 1.436680601022453e-05, "loss": 0.0048, "num_tokens": 1185128589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42092685841085603, "frac_reward_zero_std": 1.0, "grad_norm": 0.04107606334930191, "kl": 0.136474609375, "learning_rate": 1.436144542336279e-05, "loss": 0.0055, "num_tokens": 1185693357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42109755056755144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0314894610950026, "kl": 0.13720703125, "learning_rate": 1.4356083288345367e-05, "loss": 0.0055, "num_tokens": 1186258253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42126824272424684, "frac_reward_zero_std": 1.0, "grad_norm": 0.03440798980084633, "kl": 0.174072265625, "learning_rate": 1.4350719607075616e-05, "loss": 0.007, "num_tokens": 1186821197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4214389348809422, "frac_reward_zero_std": 1.0, "grad_norm": 0.025919895250401892, "kl": 0.15234375, "learning_rate": 1.4345354381457463e-05, "loss": 0.0061, "num_tokens": 1187385901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4216096270376376, "frac_reward_zero_std": 1.0, "grad_norm": 0.01321236752947622, "kl": 0.1279296875, "learning_rate": 1.433998761339536e-05, "loss": 0.0051, "num_tokens": 1187959613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.421780319194333, "frac_reward_zero_std": 1.0, "grad_norm": 0.021318030719797276, "kl": 0.1451416015625, "learning_rate": 1.4334619304794317e-05, "loss": 0.0058, "num_tokens": 1188532365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4219510113510284, "frac_reward_zero_std": 1.0, "grad_norm": 0.029199839218886515, "kl": 0.1416015625, "learning_rate": 1.4329249457559892e-05, "loss": 0.0057, "num_tokens": 1189107741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4221217035077238, "frac_reward_zero_std": 1.0, "grad_norm": 0.022312003126699707, "kl": 0.1708984375, "learning_rate": 1.432387807359819e-05, "loss": 0.0068, "num_tokens": 1189670333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42229239566441923, "frac_reward_zero_std": 1.0, "grad_norm": 0.026639906036562418, "kl": 0.140869140625, "learning_rate": 1.4318505154815851e-05, "loss": 0.0056, "num_tokens": 1190238925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42246308782111464, "frac_reward_zero_std": 1.0, "grad_norm": 0.023641661277251886, "kl": 0.1455078125, "learning_rate": 1.4313130703120073e-05, "loss": 0.0058, "num_tokens": 1190804989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42263377997781004, "frac_reward_zero_std": 1.0, "grad_norm": 0.032605646752731435, "kl": 0.1356201171875, "learning_rate": 1.4307754720418592e-05, "loss": 0.0054, "num_tokens": 1191371709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4228044721345054, "frac_reward_zero_std": 1.0, "grad_norm": 0.029462216071616008, "kl": 0.14111328125, "learning_rate": 1.4302377208619684e-05, "loss": 0.0056, "num_tokens": 1191935613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4229751642912008, "frac_reward_zero_std": 1.0, "grad_norm": 0.016291647269113044, "kl": 0.1318359375, "learning_rate": 1.4296998169632175e-05, "loss": 0.0053, "num_tokens": 1192502045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4231458564478962, "frac_reward_zero_std": 1.0, "grad_norm": 0.048449437293646, "kl": 0.1229248046875, "learning_rate": 1.4291617605365427e-05, "loss": 0.0049, "num_tokens": 1193070397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4233165486045916, "frac_reward_zero_std": 1.0, "grad_norm": 0.04893344588200268, "kl": 0.1396484375, "learning_rate": 1.4286235517729348e-05, "loss": 0.0056, "num_tokens": 1193636061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.423487240761287, "frac_reward_zero_std": 1.0, "grad_norm": 0.01752772446273195, "kl": 0.139404296875, "learning_rate": 1.4280851908634386e-05, "loss": 0.0056, "num_tokens": 1194198365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4236579329179824, "frac_reward_zero_std": 1.0, "grad_norm": 0.014133866241243178, "kl": 0.1297607421875, "learning_rate": 1.4275466779991526e-05, "loss": 0.0052, "num_tokens": 1194766701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42382862507467783, "frac_reward_zero_std": 1.0, "grad_norm": 0.09211393220164908, "kl": 0.162353515625, "learning_rate": 1.4270080133712292e-05, "loss": 0.0065, "num_tokens": 1195336813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42399931723137324, "frac_reward_zero_std": 1.0, "grad_norm": 0.016175267963348724, "kl": 0.14111328125, "learning_rate": 1.4264691971708753e-05, "loss": 0.0056, "num_tokens": 1195908173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4241700093880686, "frac_reward_zero_std": 1.0, "grad_norm": 0.02391476310495174, "kl": 0.142578125, "learning_rate": 1.4259302295893511e-05, "loss": 0.0057, "num_tokens": 1196473821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.424340701544764, "frac_reward_zero_std": 1.0, "grad_norm": 0.027145287400450257, "kl": 0.160400390625, "learning_rate": 1.4253911108179708e-05, "loss": 0.0064, "num_tokens": 1197039149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4245113937014594, "frac_reward_zero_std": 1.0, "grad_norm": 0.03607309582877155, "kl": 0.179443359375, "learning_rate": 1.4248518410481016e-05, "loss": 0.0072, "num_tokens": 1197606845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4246820858581548, "frac_reward_zero_std": 1.0, "grad_norm": 0.04441912227578026, "kl": 0.20849609375, "learning_rate": 1.4243124204711651e-05, "loss": 0.0083, "num_tokens": 1198172445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4248527780148502, "frac_reward_zero_std": 1.0, "grad_norm": 0.07078976225861644, "kl": 0.225830078125, "learning_rate": 1.4237728492786365e-05, "loss": 0.009, "num_tokens": 1198734877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4250234701715456, "frac_reward_zero_std": 1.0, "grad_norm": 0.14940247810766846, "kl": 0.30908203125, "learning_rate": 1.4232331276620433e-05, "loss": 0.0124, "num_tokens": 1199303469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42519416232824103, "frac_reward_zero_std": 1.0, "grad_norm": 0.07667959836490791, "kl": 0.217529296875, "learning_rate": 1.4226932558129677e-05, "loss": 0.0087, "num_tokens": 1199871437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42536485448493644, "frac_reward_zero_std": 1.0, "grad_norm": 0.08271519161071744, "kl": 0.165771484375, "learning_rate": 1.4221532339230448e-05, "loss": 0.0066, "num_tokens": 1200445677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4255355466416318, "frac_reward_zero_std": 1.0, "grad_norm": 0.06860754935004176, "kl": 0.1357421875, "learning_rate": 1.4216130621839626e-05, "loss": 0.0054, "num_tokens": 1201003101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4257062387983272, "frac_reward_zero_std": 1.0, "grad_norm": 0.3884390452958926, "kl": 0.15283203125, "learning_rate": 1.4210727407874624e-05, "loss": 0.0061, "num_tokens": 1201591661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4258769309550226, "frac_reward_zero_std": 1.0, "grad_norm": 0.0550865683465106, "kl": 0.1041259765625, "learning_rate": 1.4205322699253397e-05, "loss": 0.0042, "num_tokens": 1202156413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.426047623111718, "frac_reward_zero_std": 1.0, "grad_norm": 0.06769948456365121, "kl": 0.1009521484375, "learning_rate": 1.4199916497894411e-05, "loss": 0.004, "num_tokens": 1202728237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4262183152684134, "frac_reward_zero_std": 1.0, "grad_norm": 0.04255581993686642, "kl": 0.1011962890625, "learning_rate": 1.4194508805716679e-05, "loss": 0.004, "num_tokens": 1203290125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4263890074251088, "frac_reward_zero_std": 1.0, "grad_norm": 0.041398842121401803, "kl": 0.0853271484375, "learning_rate": 1.4189099624639729e-05, "loss": 0.0034, "num_tokens": 1203854061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42655969958180423, "frac_reward_zero_std": 1.0, "grad_norm": 0.02193641767201429, "kl": 0.0684814453125, "learning_rate": 1.4183688956583637e-05, "loss": 0.0027, "num_tokens": 1204417869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42673039173849964, "frac_reward_zero_std": 1.0, "grad_norm": 0.13558544760045127, "kl": 0.07427978515625, "learning_rate": 1.4178276803468983e-05, "loss": 0.003, "num_tokens": 1204980653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.426901083895195, "frac_reward_zero_std": 1.0, "grad_norm": 0.03889554400909651, "kl": 0.06268310546875, "learning_rate": 1.4172863167216892e-05, "loss": 0.0025, "num_tokens": 1205545229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4270717760518904, "frac_reward_zero_std": 1.0, "grad_norm": 0.006909175458899834, "kl": 0.0614013671875, "learning_rate": 1.4167448049749009e-05, "loss": 0.0025, "num_tokens": 1206106525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4272424682085858, "frac_reward_zero_std": 1.0, "grad_norm": 0.02299491017337434, "kl": 0.07177734375, "learning_rate": 1.4162031452987505e-05, "loss": 0.0029, "num_tokens": 1206666989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4274131603652812, "frac_reward_zero_std": 1.0, "grad_norm": 0.09630217402955314, "kl": 0.096435546875, "learning_rate": 1.4156613378855072e-05, "loss": 0.0039, "num_tokens": 1207232365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4275838525219766, "frac_reward_zero_std": 1.0, "grad_norm": 0.06727766406851944, "kl": 0.092041015625, "learning_rate": 1.415119382927494e-05, "loss": 0.0037, "num_tokens": 1207795309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.427754544678672, "frac_reward_zero_std": 1.0, "grad_norm": 0.053427704790008364, "kl": 0.0894775390625, "learning_rate": 1.4145772806170846e-05, "loss": 0.0036, "num_tokens": 1208368877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42792523683536743, "frac_reward_zero_std": 1.0, "grad_norm": 0.028881891245904744, "kl": 0.078369140625, "learning_rate": 1.414035031146706e-05, "loss": 0.0031, "num_tokens": 1208937037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42809592899206284, "frac_reward_zero_std": 1.0, "grad_norm": 0.028838007598349015, "kl": 0.0810546875, "learning_rate": 1.413492634708837e-05, "loss": 0.0032, "num_tokens": 1209502109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42826662114875824, "frac_reward_zero_std": 1.0, "grad_norm": 0.18466188275645878, "kl": 0.1004638671875, "learning_rate": 1.412950091496009e-05, "loss": 0.004, "num_tokens": 1210070477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4284373133054536, "frac_reward_zero_std": 1.0, "grad_norm": 0.06154442385260799, "kl": 0.1287841796875, "learning_rate": 1.4124074017008052e-05, "loss": 0.0051, "num_tokens": 1210634525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.428608005462149, "frac_reward_zero_std": 1.0, "grad_norm": 0.06112025492688399, "kl": 0.15478515625, "learning_rate": 1.411864565515861e-05, "loss": 0.0062, "num_tokens": 1211199053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4287786976188444, "frac_reward_zero_std": 1.0, "grad_norm": 0.11665832238218532, "kl": 0.253173828125, "learning_rate": 1.4113215831338631e-05, "loss": 0.0101, "num_tokens": 1211762925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4289493897755398, "frac_reward_zero_std": 1.0, "grad_norm": 0.17919150658139696, "kl": 0.330078125, "learning_rate": 1.4107784547475514e-05, "loss": 0.0132, "num_tokens": 1212330173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4291200819322352, "frac_reward_zero_std": 1.0, "grad_norm": 0.1371628964087462, "kl": 0.292724609375, "learning_rate": 1.4102351805497164e-05, "loss": 0.0117, "num_tokens": 1212891437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4292907740889306, "frac_reward_zero_std": 1.0, "grad_norm": 0.09356795033746194, "kl": 0.210205078125, "learning_rate": 1.4096917607332009e-05, "loss": 0.0084, "num_tokens": 1213457949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42946146624562603, "frac_reward_zero_std": 1.0, "grad_norm": 0.034279168589907226, "kl": 0.1563720703125, "learning_rate": 1.4091481954908992e-05, "loss": 0.0062, "num_tokens": 1214021837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42963215840232144, "frac_reward_zero_std": 1.0, "grad_norm": 0.014163691587341242, "kl": 0.0980224609375, "learning_rate": 1.4086044850157578e-05, "loss": 0.0039, "num_tokens": 1214588845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4298028505590168, "frac_reward_zero_std": 1.0, "grad_norm": 0.025265692500008437, "kl": 0.107177734375, "learning_rate": 1.408060629500774e-05, "loss": 0.0043, "num_tokens": 1215152109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4299735427157122, "frac_reward_zero_std": 1.0, "grad_norm": 0.026400207163890004, "kl": 0.077880859375, "learning_rate": 1.4075166291389968e-05, "loss": 0.0031, "num_tokens": 1215720445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4301442348724076, "frac_reward_zero_std": 1.0, "grad_norm": 0.02390433073270827, "kl": 0.082763671875, "learning_rate": 1.4069724841235267e-05, "loss": 0.0033, "num_tokens": 1216287741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.430314927029103, "frac_reward_zero_std": 1.0, "grad_norm": 0.016730612313727918, "kl": 0.0767822265625, "learning_rate": 1.4064281946475156e-05, "loss": 0.0031, "num_tokens": 1216851165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4304856191857984, "frac_reward_zero_std": 1.0, "grad_norm": 0.02405919756553926, "kl": 0.08935546875, "learning_rate": 1.4058837609041664e-05, "loss": 0.0036, "num_tokens": 1217416397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4306563113424938, "frac_reward_zero_std": 1.0, "grad_norm": 0.034400036400432946, "kl": 0.0780029296875, "learning_rate": 1.405339183086734e-05, "loss": 0.0031, "num_tokens": 1217986365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43082700349918923, "frac_reward_zero_std": 1.0, "grad_norm": 0.027041990014541013, "kl": 0.080078125, "learning_rate": 1.4047944613885231e-05, "loss": 0.0032, "num_tokens": 1218548333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43099769565588464, "frac_reward_zero_std": 1.0, "grad_norm": 0.02775533133643994, "kl": 0.0750732421875, "learning_rate": 1.4042495960028912e-05, "loss": 0.003, "num_tokens": 1219110173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43116838781258, "frac_reward_zero_std": 1.0, "grad_norm": 0.03344902816276907, "kl": 0.110107421875, "learning_rate": 1.403704587123245e-05, "loss": 0.0044, "num_tokens": 1219675485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4313390799692754, "frac_reward_zero_std": 1.0, "grad_norm": 0.016880678113808542, "kl": 0.086669921875, "learning_rate": 1.4031594349430434e-05, "loss": 0.0035, "num_tokens": 1220246637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4315097721259708, "frac_reward_zero_std": 1.0, "grad_norm": 0.09150457152701268, "kl": 0.1287841796875, "learning_rate": 1.4026141396557953e-05, "loss": 0.0052, "num_tokens": 1220816317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4316804642826662, "frac_reward_zero_std": 1.0, "grad_norm": 0.14597360071121276, "kl": 0.1708984375, "learning_rate": 1.4020687014550615e-05, "loss": 0.0068, "num_tokens": 1221376957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4318511564393616, "frac_reward_zero_std": 1.0, "grad_norm": 0.06996098876818552, "kl": 0.197509765625, "learning_rate": 1.4015231205344521e-05, "loss": 0.0079, "num_tokens": 1221950541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.432021848596057, "frac_reward_zero_std": 1.0, "grad_norm": 0.14632626192897938, "kl": 0.298828125, "learning_rate": 1.4009773970876297e-05, "loss": 0.012, "num_tokens": 1222518493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43219254075275243, "frac_reward_zero_std": 1.0, "grad_norm": 0.12932224348399274, "kl": 0.3095703125, "learning_rate": 1.4004315313083052e-05, "loss": 0.0124, "num_tokens": 1223086413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43236323290944784, "frac_reward_zero_std": 1.0, "grad_norm": 0.058953251293669635, "kl": 0.341796875, "learning_rate": 1.3998855233902422e-05, "loss": 0.0137, "num_tokens": 1223654125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4325339250661432, "frac_reward_zero_std": 1.0, "grad_norm": 0.05390696258372596, "kl": 0.28564453125, "learning_rate": 1.3993393735272532e-05, "loss": 0.0114, "num_tokens": 1224226525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4327046172228386, "frac_reward_zero_std": 1.0, "grad_norm": 0.0638270875279706, "kl": 0.229736328125, "learning_rate": 1.398793081913202e-05, "loss": 0.0092, "num_tokens": 1224795869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.432875309379534, "frac_reward_zero_std": 1.0, "grad_norm": 0.06963333625838417, "kl": 0.218505859375, "learning_rate": 1.3982466487420023e-05, "loss": 0.0087, "num_tokens": 1225358765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4330460015362294, "frac_reward_zero_std": 0.9375, "grad_norm": 0.07270851141743319, "kl": 0.212158203125, "learning_rate": 1.3977000742076184e-05, "loss": 0.0085, "num_tokens": 1225921181.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 2537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4332166936929248, "frac_reward_zero_std": 1.0, "grad_norm": 0.09957570582752519, "kl": 0.238037109375, "learning_rate": 1.3971533585040639e-05, "loss": 0.0095, "num_tokens": 1226485677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4333873858496202, "frac_reward_zero_std": 1.0, "grad_norm": 0.09609246814813037, "kl": 0.2412109375, "learning_rate": 1.3966065018254039e-05, "loss": 0.0097, "num_tokens": 1227048141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43355807800631563, "frac_reward_zero_std": 1.0, "grad_norm": 1.480287339845365, "kl": 0.469482421875, "learning_rate": 1.3960595043657523e-05, "loss": 0.0188, "num_tokens": 1227611325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43372877016301103, "frac_reward_zero_std": 1.0, "grad_norm": 0.1430021646012186, "kl": 0.325439453125, "learning_rate": 1.3955123663192736e-05, "loss": 0.013, "num_tokens": 1228178349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4338994623197064, "frac_reward_zero_std": 1.0, "grad_norm": 0.3212334102069848, "kl": 0.57421875, "learning_rate": 1.3949650878801819e-05, "loss": 0.023, "num_tokens": 1228742797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4340701544764018, "frac_reward_zero_std": 1.0, "grad_norm": 0.3371864673617274, "kl": 0.6650390625, "learning_rate": 1.3944176692427415e-05, "loss": 0.0266, "num_tokens": 1229310557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4342408466330972, "frac_reward_zero_std": 1.0, "grad_norm": 0.2393011935213605, "kl": 0.58203125, "learning_rate": 1.3938701106012664e-05, "loss": 0.0233, "num_tokens": 1229875821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4344115387897926, "frac_reward_zero_std": 0.9375, "grad_norm": 0.146529795557046, "kl": 0.40234375, "learning_rate": 1.3933224121501196e-05, "loss": 0.0161, "num_tokens": 1230438141.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.434582230946488, "frac_reward_zero_std": 1.0, "grad_norm": 0.7013351235255076, "kl": 0.4248046875, "learning_rate": 1.3927745740837149e-05, "loss": 0.017, "num_tokens": 1231001933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4347529231031834, "frac_reward_zero_std": 1.0, "grad_norm": 0.17915371255863927, "kl": 0.25341796875, "learning_rate": 1.3922265965965145e-05, "loss": 0.0101, "num_tokens": 1231564861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4349236152598788, "frac_reward_zero_std": 1.0, "grad_norm": 0.10323897466302917, "kl": 0.231201171875, "learning_rate": 1.391678479883031e-05, "loss": 0.0092, "num_tokens": 1232127949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43509430741657423, "frac_reward_zero_std": 1.0, "grad_norm": 0.11191129514285601, "kl": 0.254638671875, "learning_rate": 1.391130224137826e-05, "loss": 0.0102, "num_tokens": 1232690605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4352649995732696, "frac_reward_zero_std": 1.0, "grad_norm": 0.044276627517058784, "kl": 0.221923828125, "learning_rate": 1.3905818295555102e-05, "loss": 0.0089, "num_tokens": 1233250093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.435435691729965, "frac_reward_zero_std": 1.0, "grad_norm": 0.8370252098815509, "kl": 0.32470703125, "learning_rate": 1.3900332963307444e-05, "loss": 0.013, "num_tokens": 1233816317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4356063838866604, "frac_reward_zero_std": 1.0, "grad_norm": 0.06370879675250136, "kl": 0.216064453125, "learning_rate": 1.3894846246582376e-05, "loss": 0.0086, "num_tokens": 1234380333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4357770760433558, "frac_reward_zero_std": 1.0, "grad_norm": 0.22641364286243693, "kl": 0.39404296875, "learning_rate": 1.3889358147327484e-05, "loss": 0.0158, "num_tokens": 1234944493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4359477682000512, "frac_reward_zero_std": 1.0, "grad_norm": 0.11372493581423151, "kl": 0.3154296875, "learning_rate": 1.388386866749085e-05, "loss": 0.0126, "num_tokens": 1235503949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4361184603567466, "frac_reward_zero_std": 1.0, "grad_norm": 0.11209151063481619, "kl": 0.33837890625, "learning_rate": 1.3878377809021042e-05, "loss": 0.0135, "num_tokens": 1236073261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.436289152513442, "frac_reward_zero_std": 1.0, "grad_norm": 0.30266447923423717, "kl": 0.4970703125, "learning_rate": 1.3872885573867105e-05, "loss": 0.0199, "num_tokens": 1236635917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43645984467013743, "frac_reward_zero_std": 1.0, "grad_norm": 0.48236494846980343, "kl": 0.595703125, "learning_rate": 1.3867391963978601e-05, "loss": 0.0238, "num_tokens": 1237200237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4366305368268328, "frac_reward_zero_std": 1.0, "grad_norm": 0.3509823752128253, "kl": 0.59765625, "learning_rate": 1.386189698130555e-05, "loss": 0.0239, "num_tokens": 1237769677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4368012289835282, "frac_reward_zero_std": 1.0, "grad_norm": 0.238189336240985, "kl": 0.658203125, "learning_rate": 1.385640062779848e-05, "loss": 0.0263, "num_tokens": 1238331837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4369719211402236, "frac_reward_zero_std": 1.0, "grad_norm": 0.09001257546175043, "kl": 0.6416015625, "learning_rate": 1.385090290540839e-05, "loss": 0.0256, "num_tokens": 1238894605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.437142613296919, "frac_reward_zero_std": 1.0, "grad_norm": 0.05866133543741246, "kl": 0.50634765625, "learning_rate": 1.3845403816086784e-05, "loss": 0.0202, "num_tokens": 1239463181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4373133054536144, "frac_reward_zero_std": 1.0, "grad_norm": 0.27149484571703086, "kl": 0.41552734375, "learning_rate": 1.383990336178563e-05, "loss": 0.0166, "num_tokens": 1240036621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4374839976103098, "frac_reward_zero_std": 1.0, "grad_norm": 0.1708097349998693, "kl": 0.34765625, "learning_rate": 1.3834401544457398e-05, "loss": 0.0139, "num_tokens": 1240599853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4376546897670052, "frac_reward_zero_std": 1.0, "grad_norm": 0.11502625176635248, "kl": 0.24267578125, "learning_rate": 1.3828898366055028e-05, "loss": 0.0097, "num_tokens": 1241164845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43782538192370063, "frac_reward_zero_std": 1.0, "grad_norm": 0.09697108319660513, "kl": 0.18798828125, "learning_rate": 1.3823393828531958e-05, "loss": 0.0075, "num_tokens": 1241728829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.437996074080396, "frac_reward_zero_std": 1.0, "grad_norm": 0.05942652182375378, "kl": 0.181640625, "learning_rate": 1.3817887933842093e-05, "loss": 0.0073, "num_tokens": 1242293821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4381667662370914, "frac_reward_zero_std": 1.0, "grad_norm": 0.0332017232019055, "kl": 0.11767578125, "learning_rate": 1.3812380683939828e-05, "loss": 0.0047, "num_tokens": 1242874877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4383374583937868, "frac_reward_zero_std": 1.0, "grad_norm": 0.09718929910618049, "kl": 0.15771484375, "learning_rate": 1.3806872080780043e-05, "loss": 0.0063, "num_tokens": 1243438669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4385081505504822, "frac_reward_zero_std": 1.0, "grad_norm": 0.04932607326519029, "kl": 0.169921875, "learning_rate": 1.3801362126318085e-05, "loss": 0.0068, "num_tokens": 1244000605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4386788427071776, "frac_reward_zero_std": 1.0, "grad_norm": 0.021580746930067674, "kl": 0.1324462890625, "learning_rate": 1.3795850822509798e-05, "loss": 0.0053, "num_tokens": 1244564557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.438849534863873, "frac_reward_zero_std": 1.0, "grad_norm": 0.03906892781807678, "kl": 0.126708984375, "learning_rate": 1.3790338171311488e-05, "loss": 0.0051, "num_tokens": 1245134013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4390202270205684, "frac_reward_zero_std": 1.0, "grad_norm": 0.028616564157713718, "kl": 0.1962890625, "learning_rate": 1.3784824174679954e-05, "loss": 0.0078, "num_tokens": 1245697981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4391909191772638, "frac_reward_zero_std": 1.0, "grad_norm": 0.0349546689447926, "kl": 0.24560546875, "learning_rate": 1.3779308834572458e-05, "loss": 0.0098, "num_tokens": 1246258749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4393616113339592, "frac_reward_zero_std": 1.0, "grad_norm": 0.08092221883594618, "kl": 0.283935546875, "learning_rate": 1.3773792152946754e-05, "loss": 0.0114, "num_tokens": 1246828189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4395323034906546, "frac_reward_zero_std": 1.0, "grad_norm": 0.08478126972799636, "kl": 0.35986328125, "learning_rate": 1.3768274131761061e-05, "loss": 0.0144, "num_tokens": 1247399373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43970299564735, "frac_reward_zero_std": 1.0, "grad_norm": 0.050106629133619, "kl": 0.44384765625, "learning_rate": 1.3762754772974076e-05, "loss": 0.0178, "num_tokens": 1247961661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4398736878040454, "frac_reward_zero_std": 1.0, "grad_norm": 0.27182541284456463, "kl": 0.52490234375, "learning_rate": 1.3757234078544976e-05, "loss": 0.021, "num_tokens": 1248524333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4400443799607408, "frac_reward_zero_std": 1.0, "grad_norm": 0.2169257534131543, "kl": 0.61328125, "learning_rate": 1.3751712050433408e-05, "loss": 0.0245, "num_tokens": 1249091709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4402150721174362, "frac_reward_zero_std": 1.0, "grad_norm": 0.06677269549332518, "kl": 0.63671875, "learning_rate": 1.3746188690599488e-05, "loss": 0.0255, "num_tokens": 1249657021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4403857642741316, "frac_reward_zero_std": 1.0, "grad_norm": 0.08629768054453917, "kl": 0.6953125, "learning_rate": 1.3740664001003816e-05, "loss": 0.0278, "num_tokens": 1250221693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.440556456430827, "frac_reward_zero_std": 1.0, "grad_norm": 0.6972257221697521, "kl": 0.73828125, "learning_rate": 1.3735137983607452e-05, "loss": 0.0296, "num_tokens": 1250790253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4407271485875224, "frac_reward_zero_std": 1.0, "grad_norm": 0.08874611310084239, "kl": 0.5927734375, "learning_rate": 1.3729610640371935e-05, "loss": 0.0237, "num_tokens": 1251363581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4408978407442178, "frac_reward_zero_std": 1.0, "grad_norm": 0.06019771078137027, "kl": 0.5576171875, "learning_rate": 1.3724081973259274e-05, "loss": 0.0223, "num_tokens": 1251929117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4410685329009132, "frac_reward_zero_std": 1.0, "grad_norm": 0.0913446682670032, "kl": 0.5537109375, "learning_rate": 1.3718551984231938e-05, "loss": 0.0221, "num_tokens": 1252495069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4412392250576086, "frac_reward_zero_std": 1.0, "grad_norm": 0.09342274186778325, "kl": 0.537109375, "learning_rate": 1.3713020675252885e-05, "loss": 0.0215, "num_tokens": 1253062397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.441409917214304, "frac_reward_zero_std": 1.0, "grad_norm": 0.23204388122466937, "kl": 0.5654296875, "learning_rate": 1.3707488048285522e-05, "loss": 0.0226, "num_tokens": 1253626029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4415806093709994, "frac_reward_zero_std": 1.0, "grad_norm": 0.16796909621484438, "kl": 0.541015625, "learning_rate": 1.3701954105293735e-05, "loss": 0.0217, "num_tokens": 1254192525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4417513015276948, "frac_reward_zero_std": 1.0, "grad_norm": 0.1819821815244797, "kl": 0.52685546875, "learning_rate": 1.3696418848241874e-05, "loss": 0.0211, "num_tokens": 1254754973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4419219936843902, "frac_reward_zero_std": 1.0, "grad_norm": 0.13406954343454086, "kl": 0.51025390625, "learning_rate": 1.3690882279094758e-05, "loss": 0.0204, "num_tokens": 1255319677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4420926858410856, "frac_reward_zero_std": 1.0, "grad_norm": 0.1743030187123601, "kl": 0.53515625, "learning_rate": 1.3685344399817659e-05, "loss": 0.0215, "num_tokens": 1255885597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.442263377997781, "frac_reward_zero_std": 1.0, "grad_norm": 0.16986570306523183, "kl": 0.55224609375, "learning_rate": 1.3679805212376338e-05, "loss": 0.0221, "num_tokens": 1256449501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4424340701544764, "frac_reward_zero_std": 1.0, "grad_norm": 0.14605277047920426, "kl": 0.6083984375, "learning_rate": 1.3674264718736994e-05, "loss": 0.0243, "num_tokens": 1257027629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4426047623111718, "frac_reward_zero_std": 1.0, "grad_norm": 0.11365806953280135, "kl": 0.6455078125, "learning_rate": 1.3668722920866316e-05, "loss": 0.0258, "num_tokens": 1257592685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4427754544678672, "frac_reward_zero_std": 1.0, "grad_norm": 0.11323385363178039, "kl": 0.611328125, "learning_rate": 1.3663179820731426e-05, "loss": 0.0244, "num_tokens": 1258157565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4429461466245626, "frac_reward_zero_std": 1.0, "grad_norm": 0.12449146283419008, "kl": 0.6826171875, "learning_rate": 1.3657635420299939e-05, "loss": 0.0273, "num_tokens": 1258718765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.443116838781258, "frac_reward_zero_std": 1.0, "grad_norm": 0.5236529336827438, "kl": 0.8681640625, "learning_rate": 1.3652089721539908e-05, "loss": 0.0347, "num_tokens": 1259289293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4432875309379534, "frac_reward_zero_std": 1.0, "grad_norm": 0.10557370582443953, "kl": 0.66796875, "learning_rate": 1.3646542726419857e-05, "loss": 0.0267, "num_tokens": 1259851965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4434582230946488, "frac_reward_zero_std": 1.0, "grad_norm": 0.16569564193518715, "kl": 0.6767578125, "learning_rate": 1.3640994436908775e-05, "loss": 0.027, "num_tokens": 1260415885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4436289152513442, "frac_reward_zero_std": 1.0, "grad_norm": 0.13982654373299816, "kl": 0.6748046875, "learning_rate": 1.3635444854976097e-05, "loss": 0.027, "num_tokens": 1260985453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4437996074080396, "frac_reward_zero_std": 1.0, "grad_norm": 0.11991263557950724, "kl": 0.6142578125, "learning_rate": 1.3629893982591728e-05, "loss": 0.0246, "num_tokens": 1261548941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.443970299564735, "frac_reward_zero_std": 1.0, "grad_norm": 0.10360004599478065, "kl": 0.53125, "learning_rate": 1.3624341821726028e-05, "loss": 0.0212, "num_tokens": 1262124477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4441409917214304, "frac_reward_zero_std": 1.0, "grad_norm": 0.09398486839544591, "kl": 0.45703125, "learning_rate": 1.3618788374349814e-05, "loss": 0.0183, "num_tokens": 1262689469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4443116838781258, "frac_reward_zero_std": 1.0, "grad_norm": 0.08651266179457637, "kl": 0.39794921875, "learning_rate": 1.3613233642434355e-05, "loss": 0.0159, "num_tokens": 1263254893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4444823760348212, "frac_reward_zero_std": 1.0, "grad_norm": 0.08755081629986505, "kl": 0.3662109375, "learning_rate": 1.3607677627951383e-05, "loss": 0.0146, "num_tokens": 1263825373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4446530681915166, "frac_reward_zero_std": 1.0, "grad_norm": 0.06784078992179367, "kl": 0.326171875, "learning_rate": 1.3602120332873085e-05, "loss": 0.013, "num_tokens": 1264389501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.444823760348212, "frac_reward_zero_std": 1.0, "grad_norm": 0.052832974598226876, "kl": 0.2724609375, "learning_rate": 1.3596561759172096e-05, "loss": 0.0109, "num_tokens": 1264955325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4449944525049074, "frac_reward_zero_std": 1.0, "grad_norm": 0.051565011804309904, "kl": 0.249755859375, "learning_rate": 1.3591001908821512e-05, "loss": 0.01, "num_tokens": 1265521357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4451651446616028, "frac_reward_zero_std": 1.0, "grad_norm": 0.05933054163816584, "kl": 0.229736328125, "learning_rate": 1.3585440783794878e-05, "loss": 0.0092, "num_tokens": 1266085405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4453358368182982, "frac_reward_zero_std": 1.0, "grad_norm": 0.056871016523382216, "kl": 0.21728515625, "learning_rate": 1.3579878386066194e-05, "loss": 0.0087, "num_tokens": 1266647437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4455065289749936, "frac_reward_zero_std": 1.0, "grad_norm": 0.06209646015040267, "kl": 0.207763671875, "learning_rate": 1.3574314717609906e-05, "loss": 0.0083, "num_tokens": 1267212765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.445677221131689, "frac_reward_zero_std": 1.0, "grad_norm": 0.059981986579910884, "kl": 0.185791015625, "learning_rate": 1.3568749780400921e-05, "loss": 0.0074, "num_tokens": 1267778077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4458479132883844, "frac_reward_zero_std": 1.0, "grad_norm": 0.05776434504915994, "kl": 0.178466796875, "learning_rate": 1.3563183576414587e-05, "loss": 0.0071, "num_tokens": 1268342013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4460186054450798, "frac_reward_zero_std": 1.0, "grad_norm": 0.07779148806805927, "kl": 0.174560546875, "learning_rate": 1.3557616107626707e-05, "loss": 0.007, "num_tokens": 1268912477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4461892976017752, "frac_reward_zero_std": 1.0, "grad_norm": 0.054499860204394214, "kl": 0.169677734375, "learning_rate": 1.3552047376013533e-05, "loss": 0.0068, "num_tokens": 1269476925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4463599897584706, "frac_reward_zero_std": 1.0, "grad_norm": 0.1244698933781987, "kl": 0.1708984375, "learning_rate": 1.3546477383551762e-05, "loss": 0.0068, "num_tokens": 1270040109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.446530681915166, "frac_reward_zero_std": 1.0, "grad_norm": 0.05363975020884097, "kl": 0.16064453125, "learning_rate": 1.3540906132218537e-05, "loss": 0.0064, "num_tokens": 1270606749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4467013740718614, "frac_reward_zero_std": 1.0, "grad_norm": 0.1453011020398511, "kl": 0.16064453125, "learning_rate": 1.353533362399146e-05, "loss": 0.0064, "num_tokens": 1271171261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4468720662285568, "frac_reward_zero_std": 1.0, "grad_norm": 0.6459758537660751, "kl": 0.1845703125, "learning_rate": 1.352975986084856e-05, "loss": 0.0074, "num_tokens": 1271735693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4470427583852522, "frac_reward_zero_std": 1.0, "grad_norm": 0.343280952913239, "kl": 0.150146484375, "learning_rate": 1.3524184844768328e-05, "loss": 0.006, "num_tokens": 1272309709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4472134505419476, "frac_reward_zero_std": 1.0, "grad_norm": 2294.6850552479714, "kl": 185.0946044921875, "learning_rate": 1.351860857772969e-05, "loss": 7.3964, "num_tokens": 1272879245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.447384142698643, "frac_reward_zero_std": 1.0, "grad_norm": 0.46444202213552066, "kl": 0.164306640625, "learning_rate": 1.3513031061712026e-05, "loss": 0.0066, "num_tokens": 1273450013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4475548348553384, "frac_reward_zero_std": 1.0, "grad_norm": 0.5235851502909442, "kl": 0.130859375, "learning_rate": 1.3507452298695143e-05, "loss": 0.0052, "num_tokens": 1274019469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4477255270120338, "frac_reward_zero_std": 1.0, "grad_norm": 0.024407967703227822, "kl": 0.07275390625, "learning_rate": 1.350187229065931e-05, "loss": 0.0029, "num_tokens": 1274584509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4478962191687292, "frac_reward_zero_std": 1.0, "grad_norm": 0.014474582470573224, "kl": 0.0802001953125, "learning_rate": 1.3496291039585221e-05, "loss": 0.0032, "num_tokens": 1275156781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4480669113254246, "frac_reward_zero_std": 1.0, "grad_norm": 0.01681156185266789, "kl": 0.07861328125, "learning_rate": 1.3490708547454024e-05, "loss": 0.0031, "num_tokens": 1275715309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.44823760348212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0660158790275715, "kl": 0.0975341796875, "learning_rate": 1.3485124816247297e-05, "loss": 0.0039, "num_tokens": 1276282957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4484082956388154, "frac_reward_zero_std": 1.0, "grad_norm": 0.029916195603957182, "kl": 0.1309814453125, "learning_rate": 1.3479539847947067e-05, "loss": 0.0052, "num_tokens": 1276850733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4485789877955108, "frac_reward_zero_std": 1.0, "grad_norm": 0.04727600830717582, "kl": 0.132568359375, "learning_rate": 1.3473953644535798e-05, "loss": 0.0053, "num_tokens": 1277417005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4487496799522062, "frac_reward_zero_std": 1.0, "grad_norm": 0.15422107247042693, "kl": 0.1649169921875, "learning_rate": 1.3468366207996386e-05, "loss": 0.0066, "num_tokens": 1277990653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4489203721089016, "frac_reward_zero_std": 1.0, "grad_norm": 0.06157524624712741, "kl": 0.155029296875, "learning_rate": 1.3462777540312171e-05, "loss": 0.0062, "num_tokens": 1278560685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.449091064265597, "frac_reward_zero_std": 1.0, "grad_norm": 0.04856900365427093, "kl": 0.169921875, "learning_rate": 1.3457187643466928e-05, "loss": 0.0068, "num_tokens": 1279132669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4492617564222924, "frac_reward_zero_std": 1.0, "grad_norm": 0.1351794051804678, "kl": 0.17431640625, "learning_rate": 1.345159651944487e-05, "loss": 0.007, "num_tokens": 1279697757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4494324485789878, "frac_reward_zero_std": 1.0, "grad_norm": 0.5701691681945604, "kl": 0.239990234375, "learning_rate": 1.3446004170230641e-05, "loss": 0.0096, "num_tokens": 1280260829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4496031407356832, "frac_reward_zero_std": 1.0, "grad_norm": 0.7952037882550935, "kl": 0.220947265625, "learning_rate": 1.3440410597809325e-05, "loss": 0.0088, "num_tokens": 1280824349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4497738328923786, "frac_reward_zero_std": 1.0, "grad_norm": 0.008897291030009564, "kl": 0.028778076171875, "learning_rate": 1.3434815804166441e-05, "loss": 0.0012, "num_tokens": 1281386285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.449944525049074, "frac_reward_zero_std": 1.0, "grad_norm": 3.742042525343349e-05, "kl": 0.0233154296875, "learning_rate": 1.3429219791287936e-05, "loss": 0.0009, "num_tokens": 1281951469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4501152172057694, "frac_reward_zero_std": 1.0, "grad_norm": 3.375520851335888e-09, "kl": 0.0233154296875, "learning_rate": 1.3423622561160193e-05, "loss": 0.0009, "num_tokens": 1282518509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4502859093624648, "frac_reward_zero_std": 1.0, "grad_norm": 5.05789909561769e-11, "kl": 0.023345947265625, "learning_rate": 1.3418024115770024e-05, "loss": 0.0009, "num_tokens": 1283085261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45045660151916017, "frac_reward_zero_std": 1.0, "grad_norm": 5.221656638665448e-12, "kl": 0.02374267578125, "learning_rate": 1.341242445710468e-05, "loss": 0.001, "num_tokens": 1283646317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4506272936758556, "frac_reward_zero_std": 1.0, "grad_norm": 6.927205541191654e-13, "kl": 0.02264404296875, "learning_rate": 1.3406823587151834e-05, "loss": 0.0009, "num_tokens": 1284207597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.450797985832551, "frac_reward_zero_std": 1.0, "grad_norm": 1.1970908732110508e-13, "kl": 0.0224609375, "learning_rate": 1.3401221507899594e-05, "loss": 0.0009, "num_tokens": 1284774573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4509686779892464, "frac_reward_zero_std": 1.0, "grad_norm": 2.9273753986700055e-14, "kl": 0.022705078125, "learning_rate": 1.3395618221336496e-05, "loss": 0.0009, "num_tokens": 1285343277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4511393701459418, "frac_reward_zero_std": 1.0, "grad_norm": 8.276702950489164e-15, "kl": 0.022979736328125, "learning_rate": 1.3390013729451502e-05, "loss": 0.0009, "num_tokens": 1285908285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4513100623026372, "frac_reward_zero_std": 1.0, "grad_norm": 3.205283549511664e-15, "kl": 0.023956298828125, "learning_rate": 1.338440803423401e-05, "loss": 0.001, "num_tokens": 1286469997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4514807544593326, "frac_reward_zero_std": 1.0, "grad_norm": 1.1971832064102088e-15, "kl": 0.02313232421875, "learning_rate": 1.3378801137673834e-05, "loss": 0.0009, "num_tokens": 1287035309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.451651446616028, "frac_reward_zero_std": 1.0, "grad_norm": 5.009922809461353e-16, "kl": 0.023040771484375, "learning_rate": 1.3373193041761218e-05, "loss": 0.0009, "num_tokens": 1287610301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45182213877272337, "frac_reward_zero_std": 1.0, "grad_norm": 2.2145543010292543e-16, "kl": 0.02276611328125, "learning_rate": 1.3367583748486846e-05, "loss": 0.0009, "num_tokens": 1288173005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4519928309294188, "frac_reward_zero_std": 1.0, "grad_norm": 1.2423142635610582e-16, "kl": 0.0238037109375, "learning_rate": 1.3361973259841798e-05, "loss": 0.001, "num_tokens": 1288735485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4521635230861142, "frac_reward_zero_std": 1.0, "grad_norm": 6.98118338603582e-17, "kl": 0.024322509765625, "learning_rate": 1.3356361577817609e-05, "loss": 0.001, "num_tokens": 1289301997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4523342152428096, "frac_reward_zero_std": 1.0, "grad_norm": 4.3334478867776034e-17, "kl": 0.024169921875, "learning_rate": 1.3350748704406213e-05, "loss": 0.001, "num_tokens": 1289869949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.452504907399505, "frac_reward_zero_std": 1.0, "grad_norm": 2.573552788447321e-17, "kl": 0.02410888671875, "learning_rate": 1.3345134641599987e-05, "loss": 0.001, "num_tokens": 1290434909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4526755995562004, "frac_reward_zero_std": 1.0, "grad_norm": 1.6366202909748123e-17, "kl": 0.023712158203125, "learning_rate": 1.3339519391391711e-05, "loss": 0.0009, "num_tokens": 1291000877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4528462917128958, "frac_reward_zero_std": 1.0, "grad_norm": 1.1307514703763686e-17, "kl": 0.02374267578125, "learning_rate": 1.3333902955774605e-05, "loss": 0.001, "num_tokens": 1291564285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4530169838695912, "frac_reward_zero_std": 1.0, "grad_norm": 7.967182000776026e-18, "kl": 0.022705078125, "learning_rate": 1.332828533674229e-05, "loss": 0.0009, "num_tokens": 1292132749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45318767602628657, "frac_reward_zero_std": 1.0, "grad_norm": 6.026401856391683e-18, "kl": 0.023345947265625, "learning_rate": 1.3322666536288833e-05, "loss": 0.0009, "num_tokens": 1292700621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.453358368182982, "frac_reward_zero_std": 1.0, "grad_norm": 4.31090705994187e-18, "kl": 0.023101806640625, "learning_rate": 1.331704655640869e-05, "loss": 0.0009, "num_tokens": 1293271917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4535290603396774, "frac_reward_zero_std": 1.0, "grad_norm": 3.601657206617946e-18, "kl": 0.0234375, "learning_rate": 1.3311425399096762e-05, "loss": 0.0009, "num_tokens": 1293841517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4536997524963728, "frac_reward_zero_std": 1.0, "grad_norm": 2.7126302166743684e-18, "kl": 0.02337646484375, "learning_rate": 1.3305803066348352e-05, "loss": 0.0009, "num_tokens": 1294403261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4538704446530682, "frac_reward_zero_std": 1.0, "grad_norm": 2.3631826576979645e-18, "kl": 0.0228271484375, "learning_rate": 1.3300179560159186e-05, "loss": 0.0009, "num_tokens": 1294961869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4540411368097636, "frac_reward_zero_std": 1.0, "grad_norm": 2.1372239623920978e-18, "kl": 0.024383544921875, "learning_rate": 1.3294554882525405e-05, "loss": 0.001, "num_tokens": 1295521565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.454211828966459, "frac_reward_zero_std": 1.0, "grad_norm": 1.8188213681170646e-18, "kl": 0.023590087890625, "learning_rate": 1.328892903544357e-05, "loss": 0.0009, "num_tokens": 1296087517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4543825211231544, "frac_reward_zero_std": 1.0, "grad_norm": 1.5677985646967445e-18, "kl": 0.024078369140625, "learning_rate": 1.3283302020910647e-05, "loss": 0.001, "num_tokens": 1296657933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45455321327984977, "frac_reward_zero_std": 1.0, "grad_norm": 1.437597686163971e-18, "kl": 0.024017333984375, "learning_rate": 1.327767384092403e-05, "loss": 0.001, "num_tokens": 1297226141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4547239054365452, "frac_reward_zero_std": 1.0, "grad_norm": 1.1597064024383553e-18, "kl": 0.023284912109375, "learning_rate": 1.3272044497481516e-05, "loss": 0.0009, "num_tokens": 1297795101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4548945975932406, "frac_reward_zero_std": 1.0, "grad_norm": 1.0278761537783987e-18, "kl": 0.02301025390625, "learning_rate": 1.326641399258132e-05, "loss": 0.0009, "num_tokens": 1298358109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.455065289749936, "frac_reward_zero_std": 1.0, "grad_norm": 9.811133079113447e-19, "kl": 0.0242919921875, "learning_rate": 1.3260782328222066e-05, "loss": 0.001, "num_tokens": 1298932541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4552359819066314, "frac_reward_zero_std": 1.0, "grad_norm": 8.23535623451846e-19, "kl": 0.022308349609375, "learning_rate": 1.3255149506402793e-05, "loss": 0.0009, "num_tokens": 1299497981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4554066740633268, "frac_reward_zero_std": 1.0, "grad_norm": 9.275813675640937e-19, "kl": 0.023956298828125, "learning_rate": 1.324951552912295e-05, "loss": 0.001, "num_tokens": 1300062205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4555773662200222, "frac_reward_zero_std": 1.0, "grad_norm": 7.905763012656781e-19, "kl": 0.02227783203125, "learning_rate": 1.3243880398382394e-05, "loss": 0.0009, "num_tokens": 1300625149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4557480583767176, "frac_reward_zero_std": 1.0, "grad_norm": 7.663589628290322e-19, "kl": 0.023040771484375, "learning_rate": 1.3238244116181396e-05, "loss": 0.0009, "num_tokens": 1301188717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45591875053341296, "frac_reward_zero_std": 1.0, "grad_norm": 6.5549466913390545e-19, "kl": 0.0235595703125, "learning_rate": 1.3232606684520627e-05, "loss": 0.0009, "num_tokens": 1301753949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45608944269010837, "frac_reward_zero_std": 1.0, "grad_norm": 6.657485399879905e-19, "kl": 0.023406982421875, "learning_rate": 1.322696810540118e-05, "loss": 0.0009, "num_tokens": 1302325533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4562601348468038, "frac_reward_zero_std": 1.0, "grad_norm": 6.01930980672701e-19, "kl": 0.023193359375, "learning_rate": 1.322132838082454e-05, "loss": 0.0009, "num_tokens": 1302891437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4564308270034992, "frac_reward_zero_std": 1.0, "grad_norm": 5.740661033116288e-19, "kl": 0.022918701171875, "learning_rate": 1.3215687512792607e-05, "loss": 0.0009, "num_tokens": 1303459117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4566015191601946, "frac_reward_zero_std": 1.0, "grad_norm": 6.048915034323402e-19, "kl": 0.023773193359375, "learning_rate": 1.3210045503307689e-05, "loss": 0.001, "num_tokens": 1304025997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45677221131689, "frac_reward_zero_std": 1.0, "grad_norm": 6.465342214295645e-19, "kl": 0.024261474609375, "learning_rate": 1.320440235437249e-05, "loss": 0.001, "num_tokens": 1304590045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4569429034735854, "frac_reward_zero_std": 1.0, "grad_norm": 5.642679615819584e-19, "kl": 0.023773193359375, "learning_rate": 1.3198758067990132e-05, "loss": 0.001, "num_tokens": 1305150573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4571135956302808, "frac_reward_zero_std": 1.0, "grad_norm": 5.224108372294325e-19, "kl": 0.023101806640625, "learning_rate": 1.3193112646164124e-05, "loss": 0.0009, "num_tokens": 1305712397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45728428778697616, "frac_reward_zero_std": 1.0, "grad_norm": 5.433034852484116e-19, "kl": 0.023406982421875, "learning_rate": 1.3187466090898396e-05, "loss": 0.0009, "num_tokens": 1306279165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45745497994367157, "frac_reward_zero_std": 1.0, "grad_norm": 5.476845889479932e-19, "kl": 0.0234375, "learning_rate": 1.3181818404197262e-05, "loss": 0.0009, "num_tokens": 1306848701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.457625672100367, "frac_reward_zero_std": 1.0, "grad_norm": 5.470886590033078e-19, "kl": 0.023834228515625, "learning_rate": 1.3176169588065456e-05, "loss": 0.001, "num_tokens": 1307415037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4577963642570624, "frac_reward_zero_std": 1.0, "grad_norm": 5.008407286828688e-19, "kl": 0.023468017578125, "learning_rate": 1.3170519644508095e-05, "loss": 0.0009, "num_tokens": 1307983277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4579670564137578, "frac_reward_zero_std": 1.0, "grad_norm": 5.546483435453364e-19, "kl": 0.0242919921875, "learning_rate": 1.3164868575530714e-05, "loss": 0.001, "num_tokens": 1308553965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4581377485704532, "frac_reward_zero_std": 1.0, "grad_norm": 4.699274810030157e-19, "kl": 0.02301025390625, "learning_rate": 1.3159216383139228e-05, "loss": 0.0009, "num_tokens": 1309122573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4583084407271486, "frac_reward_zero_std": 1.0, "grad_norm": 4.924831086017116e-19, "kl": 0.024383544921875, "learning_rate": 1.3153563069339975e-05, "loss": 0.001, "num_tokens": 1309688557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.458479132883844, "frac_reward_zero_std": 1.0, "grad_norm": 5.291021429486672e-19, "kl": 0.023773193359375, "learning_rate": 1.3147908636139662e-05, "loss": 0.001, "num_tokens": 1310249869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45864982504053936, "frac_reward_zero_std": 1.0, "grad_norm": 5.115230479858268e-19, "kl": 0.024658203125, "learning_rate": 1.314225308554542e-05, "loss": 0.001, "num_tokens": 1310817437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45882051719723477, "frac_reward_zero_std": 1.0, "grad_norm": 5.012225513860485e-19, "kl": 0.023406982421875, "learning_rate": 1.313659641956476e-05, "loss": 0.0009, "num_tokens": 1311380989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4589912093539302, "frac_reward_zero_std": 1.0, "grad_norm": 5.040927572202616e-19, "kl": 0.024017333984375, "learning_rate": 1.3130938640205595e-05, "loss": 0.001, "num_tokens": 1311942141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4591619015106256, "frac_reward_zero_std": 1.0, "grad_norm": 4.723510218706566e-19, "kl": 0.02410888671875, "learning_rate": 1.3125279749476235e-05, "loss": 0.001, "num_tokens": 1312505885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.459332593667321, "frac_reward_zero_std": 1.0, "grad_norm": 4.729571198924904e-19, "kl": 0.0228271484375, "learning_rate": 1.3119619749385379e-05, "loss": 0.0009, "num_tokens": 1313068861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4595032858240164, "frac_reward_zero_std": 1.0, "grad_norm": 4.629682293991753e-19, "kl": 0.02362060546875, "learning_rate": 1.3113958641942124e-05, "loss": 0.0009, "num_tokens": 1313632237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4596739779807118, "frac_reward_zero_std": 1.0, "grad_norm": 4.679222896902072e-19, "kl": 0.02313232421875, "learning_rate": 1.310829642915596e-05, "loss": 0.0009, "num_tokens": 1314206525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4598446701374072, "frac_reward_zero_std": 1.0, "grad_norm": 4.3064568206122134e-19, "kl": 0.022674560546875, "learning_rate": 1.3102633113036764e-05, "loss": 0.0009, "num_tokens": 1314775037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46001536229410256, "frac_reward_zero_std": 1.0, "grad_norm": 4.692992667751783e-19, "kl": 0.0233154296875, "learning_rate": 1.3096968695594814e-05, "loss": 0.0009, "num_tokens": 1315341997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46018605445079797, "frac_reward_zero_std": 1.0, "grad_norm": 4.2703945120570074e-19, "kl": 0.023895263671875, "learning_rate": 1.309130317884077e-05, "loss": 0.001, "num_tokens": 1315902957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4603567466074934, "frac_reward_zero_std": 1.0, "grad_norm": 4.792259554140622e-19, "kl": 0.02337646484375, "learning_rate": 1.3085636564785686e-05, "loss": 0.0009, "num_tokens": 1316471949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4605274387641888, "frac_reward_zero_std": 1.0, "grad_norm": 4.579931124010464e-19, "kl": 0.023651123046875, "learning_rate": 1.307996885544101e-05, "loss": 0.0009, "num_tokens": 1317038205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4606981309208842, "frac_reward_zero_std": 1.0, "grad_norm": 4.170145647570937e-19, "kl": 0.023345947265625, "learning_rate": 1.3074300052818567e-05, "loss": 0.0009, "num_tokens": 1317615085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4608688230775796, "frac_reward_zero_std": 1.0, "grad_norm": 4.183579229767402e-19, "kl": 0.02288818359375, "learning_rate": 1.3068630158930583e-05, "loss": 0.0009, "num_tokens": 1318178141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.461039515234275, "frac_reward_zero_std": 1.0, "grad_norm": 4.569607290771098e-19, "kl": 0.023773193359375, "learning_rate": 1.3062959175789665e-05, "loss": 0.0009, "num_tokens": 1318744397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4612102073909704, "frac_reward_zero_std": 1.0, "grad_norm": 4.222597004503559e-19, "kl": 0.0234375, "learning_rate": 1.3057287105408802e-05, "loss": 0.0009, "num_tokens": 1319307533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46138089954766576, "frac_reward_zero_std": 1.0, "grad_norm": 4.341035494727734e-19, "kl": 0.02325439453125, "learning_rate": 1.3051613949801382e-05, "loss": 0.0009, "num_tokens": 1319868973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46155159170436116, "frac_reward_zero_std": 1.0, "grad_norm": 4.782559479383211e-19, "kl": 0.02325439453125, "learning_rate": 1.3045939710981165e-05, "loss": 0.0009, "num_tokens": 1320439229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46172228386105657, "frac_reward_zero_std": 1.0, "grad_norm": 4.687179558735682e-19, "kl": 0.0235595703125, "learning_rate": 1.3040264390962305e-05, "loss": 0.0009, "num_tokens": 1321010989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.461892976017752, "frac_reward_zero_std": 1.0, "grad_norm": 4.651103765821333e-19, "kl": 0.025238037109375, "learning_rate": 1.3034587991759331e-05, "loss": 0.001, "num_tokens": 1321579965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4620636681744474, "frac_reward_zero_std": 1.0, "grad_norm": 4.582223195296763e-19, "kl": 0.02374267578125, "learning_rate": 1.3028910515387164e-05, "loss": 0.0009, "num_tokens": 1322145949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4622343603311428, "frac_reward_zero_std": 1.0, "grad_norm": 4.386541514280728e-19, "kl": 0.022735595703125, "learning_rate": 1.30232319638611e-05, "loss": 0.0009, "num_tokens": 1322716989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4624050524878382, "frac_reward_zero_std": 1.0, "grad_norm": 4.452042651555228e-19, "kl": 0.023162841796875, "learning_rate": 1.3017552339196821e-05, "loss": 0.0009, "num_tokens": 1323282797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4625757446445336, "frac_reward_zero_std": 1.0, "grad_norm": 4.833687154603487e-19, "kl": 0.023712158203125, "learning_rate": 1.3011871643410391e-05, "loss": 0.0009, "num_tokens": 1323845165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.462746436801229, "frac_reward_zero_std": 1.0, "grad_norm": 4.3016379792100713e-19, "kl": 0.022796630859375, "learning_rate": 1.3006189878518249e-05, "loss": 0.0009, "num_tokens": 1324410973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46291712895792436, "frac_reward_zero_std": 1.0, "grad_norm": 4.399831101000668e-19, "kl": 0.023193359375, "learning_rate": 1.3000507046537216e-05, "loss": 0.0009, "num_tokens": 1324979821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46308782111461977, "frac_reward_zero_std": 1.0, "grad_norm": 4.009020830752473e-19, "kl": 0.022674560546875, "learning_rate": 1.2994823149484494e-05, "loss": 0.0009, "num_tokens": 1325539133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4632585132713152, "frac_reward_zero_std": 1.0, "grad_norm": 4.812863962315828e-19, "kl": 0.0244140625, "learning_rate": 1.2989138189377664e-05, "loss": 0.001, "num_tokens": 1326106781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4634292054280106, "frac_reward_zero_std": 1.0, "grad_norm": 4.399115288771886e-19, "kl": 0.022918701171875, "learning_rate": 1.2983452168234676e-05, "loss": 0.0009, "num_tokens": 1326673245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.463599897584706, "frac_reward_zero_std": 1.0, "grad_norm": 4.392294849312362e-19, "kl": 0.022796630859375, "learning_rate": 1.2977765088073863e-05, "loss": 0.0009, "num_tokens": 1327245549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4637705897414014, "frac_reward_zero_std": 1.0, "grad_norm": 4.2023956358982533e-19, "kl": 0.022857666015625, "learning_rate": 1.2972076950913937e-05, "loss": 0.0009, "num_tokens": 1327817229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4639412818980968, "frac_reward_zero_std": 1.0, "grad_norm": 4.472382911038726e-19, "kl": 0.02423095703125, "learning_rate": 1.2966387758773979e-05, "loss": 0.001, "num_tokens": 1328386141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4641119740547922, "frac_reward_zero_std": 1.0, "grad_norm": 4.358049444823963e-19, "kl": 0.023345947265625, "learning_rate": 1.2960697513673444e-05, "loss": 0.0009, "num_tokens": 1328961389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46428266621148756, "frac_reward_zero_std": 1.0, "grad_norm": 4.426684099971953e-19, "kl": 0.023956298828125, "learning_rate": 1.2955006217632172e-05, "loss": 0.001, "num_tokens": 1329541453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46445335836818297, "frac_reward_zero_std": 1.0, "grad_norm": 4.311163400950503e-19, "kl": 0.023406982421875, "learning_rate": 1.2949313872670356e-05, "loss": 0.0009, "num_tokens": 1330107885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4646240505248784, "frac_reward_zero_std": 1.0, "grad_norm": 4.827849909280804e-19, "kl": 0.024078369140625, "learning_rate": 1.2943620480808587e-05, "loss": 0.001, "num_tokens": 1330677485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4647947426815738, "frac_reward_zero_std": 1.0, "grad_norm": 3.9886512570073496e-19, "kl": 0.02178955078125, "learning_rate": 1.2937926044067802e-05, "loss": 0.0009, "num_tokens": 1331250413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4649654348382692, "frac_reward_zero_std": 1.0, "grad_norm": 4.373780258997711e-19, "kl": 0.023162841796875, "learning_rate": 1.2932230564469324e-05, "loss": 0.0009, "num_tokens": 1331815613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4651361269949646, "frac_reward_zero_std": 1.0, "grad_norm": 4.9567728525461315e-19, "kl": 0.02490234375, "learning_rate": 1.2926534044034841e-05, "loss": 0.001, "num_tokens": 1332378797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46530681915166, "frac_reward_zero_std": 1.0, "grad_norm": 4.135314723984265e-19, "kl": 0.02386474609375, "learning_rate": 1.2920836484786419e-05, "loss": 0.001, "num_tokens": 1332952637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4654775113083554, "frac_reward_zero_std": 1.0, "grad_norm": 4.1199470874039943e-19, "kl": 0.02191162109375, "learning_rate": 1.2915137888746478e-05, "loss": 0.0009, "num_tokens": 1333520109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46564820346505076, "frac_reward_zero_std": 1.0, "grad_norm": 4.475183537584393e-19, "kl": 0.024169921875, "learning_rate": 1.2909438257937819e-05, "loss": 0.001, "num_tokens": 1334087133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46581889562174617, "frac_reward_zero_std": 1.0, "grad_norm": 4.082480639197645e-19, "kl": 0.02313232421875, "learning_rate": 1.2903737594383601e-05, "loss": 0.0009, "num_tokens": 1334658653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46598958777844157, "frac_reward_zero_std": 1.0, "grad_norm": 4.453913804040411e-19, "kl": 0.0230712890625, "learning_rate": 1.2898035900107356e-05, "loss": 0.0009, "num_tokens": 1335224237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.466160279935137, "frac_reward_zero_std": 1.0, "grad_norm": 4.507006375382334e-19, "kl": 0.022552490234375, "learning_rate": 1.289233317713298e-05, "loss": 0.0009, "num_tokens": 1335789613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4663309720918324, "frac_reward_zero_std": 1.0, "grad_norm": 3.957788655816972e-19, "kl": 0.022003173828125, "learning_rate": 1.2886629427484734e-05, "loss": 0.0009, "num_tokens": 1336353085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4665016642485278, "frac_reward_zero_std": 1.0, "grad_norm": 5.076947220502001e-19, "kl": 0.02459716796875, "learning_rate": 1.2880924653187243e-05, "loss": 0.001, "num_tokens": 1336917069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4666723564052232, "frac_reward_zero_std": 1.0, "grad_norm": 4.74192227586433e-19, "kl": 0.02374267578125, "learning_rate": 1.2875218856265494e-05, "loss": 0.0009, "num_tokens": 1337477709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4668430485619186, "frac_reward_zero_std": 1.0, "grad_norm": 4.0097794068172896e-19, "kl": 0.022613525390625, "learning_rate": 1.2869512038744843e-05, "loss": 0.0009, "num_tokens": 1338038637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46701374071861396, "frac_reward_zero_std": 1.0, "grad_norm": 4.266695552756969e-19, "kl": 0.02325439453125, "learning_rate": 1.2863804202650998e-05, "loss": 0.0009, "num_tokens": 1338604397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46718443287530936, "frac_reward_zero_std": 1.0, "grad_norm": 4.607603780247099e-19, "kl": 0.022796630859375, "learning_rate": 1.285809535001004e-05, "loss": 0.0009, "num_tokens": 1339169005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46735512503200477, "frac_reward_zero_std": 1.0, "grad_norm": 4.070038920929056e-19, "kl": 0.022918701171875, "learning_rate": 1.2852385482848404e-05, "loss": 0.0009, "num_tokens": 1339735997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4675258171887002, "frac_reward_zero_std": 1.0, "grad_norm": 4.307759741578793e-19, "kl": 0.023773193359375, "learning_rate": 1.2846674603192888e-05, "loss": 0.001, "num_tokens": 1340302125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4676965093453956, "frac_reward_zero_std": 1.0, "grad_norm": 4.456429017596958e-19, "kl": 0.023345947265625, "learning_rate": 1.2840962713070645e-05, "loss": 0.0009, "num_tokens": 1340874813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.467867201502091, "frac_reward_zero_std": 1.0, "grad_norm": 4.451046814706384e-19, "kl": 0.023345947265625, "learning_rate": 1.2835249814509194e-05, "loss": 0.0009, "num_tokens": 1341437485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4680378936587864, "frac_reward_zero_std": 1.0, "grad_norm": 4.385279365083815e-19, "kl": 0.023712158203125, "learning_rate": 1.2829535909536403e-05, "loss": 0.001, "num_tokens": 1341999213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4682085858154818, "frac_reward_zero_std": 1.0, "grad_norm": 4.1380280088050583e-19, "kl": 0.022369384765625, "learning_rate": 1.2823821000180508e-05, "loss": 0.0009, "num_tokens": 1342560749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46837927797217715, "frac_reward_zero_std": 1.0, "grad_norm": 4.093343214777541e-19, "kl": 0.0225830078125, "learning_rate": 1.2818105088470088e-05, "loss": 0.0009, "num_tokens": 1343121421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46854997012887256, "frac_reward_zero_std": 1.0, "grad_norm": 4.230164311520619e-19, "kl": 0.023040771484375, "learning_rate": 1.2812388176434091e-05, "loss": 0.0009, "num_tokens": 1343686365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46872066228556797, "frac_reward_zero_std": 1.0, "grad_norm": 4.3073050389447213e-19, "kl": 0.022552490234375, "learning_rate": 1.2806670266101814e-05, "loss": 0.0009, "num_tokens": 1344247197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4688913544422634, "frac_reward_zero_std": 1.0, "grad_norm": 3.9154464019676725e-19, "kl": 0.023223876953125, "learning_rate": 1.2800951359502907e-05, "loss": 0.0009, "num_tokens": 1344812717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4690620465989588, "frac_reward_zero_std": 1.0, "grad_norm": 4.1304273720088017e-19, "kl": 0.022613525390625, "learning_rate": 1.2795231458667374e-05, "loss": 0.0009, "num_tokens": 1345376957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4692327387556542, "frac_reward_zero_std": 1.0, "grad_norm": 4.429338542229245e-19, "kl": 0.022369384765625, "learning_rate": 1.2789510565625576e-05, "loss": 0.0009, "num_tokens": 1345942621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4694034309123496, "frac_reward_zero_std": 1.0, "grad_norm": 4.2663780962080487e-19, "kl": 0.022705078125, "learning_rate": 1.2783788682408225e-05, "loss": 0.0009, "num_tokens": 1346512397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.469574123069045, "frac_reward_zero_std": 1.0, "grad_norm": 4.065223690865731e-19, "kl": 0.022918701171875, "learning_rate": 1.2778065811046376e-05, "loss": 0.0009, "num_tokens": 1347080765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46974481522574035, "frac_reward_zero_std": 1.0, "grad_norm": 4.714736798822786e-19, "kl": 0.02410888671875, "learning_rate": 1.2772341953571452e-05, "loss": 0.001, "num_tokens": 1347644061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46991550738243576, "frac_reward_zero_std": 1.0, "grad_norm": 4.370110817439423e-19, "kl": 0.023040771484375, "learning_rate": 1.2766617112015207e-05, "loss": 0.0009, "num_tokens": 1348207933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47008619953913117, "frac_reward_zero_std": 1.0, "grad_norm": 4.700818673474757e-19, "kl": 0.024017333984375, "learning_rate": 1.2760891288409759e-05, "loss": 0.001, "num_tokens": 1348772013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4702568916958266, "frac_reward_zero_std": 1.0, "grad_norm": 4.524774649615008e-19, "kl": 0.022857666015625, "learning_rate": 1.2755164484787562e-05, "loss": 0.0009, "num_tokens": 1349337661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.470427583852522, "frac_reward_zero_std": 1.0, "grad_norm": 4.283184714469236e-19, "kl": 0.02313232421875, "learning_rate": 1.2749436703181432e-05, "loss": 0.0009, "num_tokens": 1349907021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4705982760092174, "frac_reward_zero_std": 1.0, "grad_norm": 4.563993488307402e-19, "kl": 0.02362060546875, "learning_rate": 1.274370794562452e-05, "loss": 0.0009, "num_tokens": 1350474173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4707689681659128, "frac_reward_zero_std": 1.0, "grad_norm": 4.480404149801116e-19, "kl": 0.02447509765625, "learning_rate": 1.2737978214150331e-05, "loss": 0.001, "num_tokens": 1351035757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4709396603226082, "frac_reward_zero_std": 1.0, "grad_norm": 4.370654375561559e-19, "kl": 0.0233154296875, "learning_rate": 1.2732247510792707e-05, "loss": 0.0009, "num_tokens": 1351603965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47111035247930355, "frac_reward_zero_std": 1.0, "grad_norm": 4.1368593025736046e-19, "kl": 0.02349853515625, "learning_rate": 1.2726515837585852e-05, "loss": 0.0009, "num_tokens": 1352180237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47128104463599896, "frac_reward_zero_std": 1.0, "grad_norm": 4.645107889430194e-19, "kl": 0.023651123046875, "learning_rate": 1.272078319656429e-05, "loss": 0.0009, "num_tokens": 1352754317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47145173679269436, "frac_reward_zero_std": 1.0, "grad_norm": 4.3004464948772306e-19, "kl": 0.02276611328125, "learning_rate": 1.2715049589762909e-05, "loss": 0.0009, "num_tokens": 1353320173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47162242894938977, "frac_reward_zero_std": 1.0, "grad_norm": 4.3243738694251345e-19, "kl": 0.023529052734375, "learning_rate": 1.2709315019216933e-05, "loss": 0.0009, "num_tokens": 1353885437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4717931211060852, "frac_reward_zero_std": 1.0, "grad_norm": 4.491061945350095e-19, "kl": 0.023834228515625, "learning_rate": 1.2703579486961925e-05, "loss": 0.001, "num_tokens": 1354449181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4719638132627806, "frac_reward_zero_std": 1.0, "grad_norm": 4.586172875610258e-19, "kl": 0.02459716796875, "learning_rate": 1.2697842995033793e-05, "loss": 0.001, "num_tokens": 1355012541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.472134505419476, "frac_reward_zero_std": 1.0, "grad_norm": 4.379192201633571e-19, "kl": 0.0233154296875, "learning_rate": 1.2692105545468785e-05, "loss": 0.0009, "num_tokens": 1355573773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4723051975761714, "frac_reward_zero_std": 1.0, "grad_norm": 4.636412465217707e-19, "kl": 0.023590087890625, "learning_rate": 1.2686367140303489e-05, "loss": 0.0009, "num_tokens": 1356136749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47247588973286675, "frac_reward_zero_std": 1.0, "grad_norm": 4.1075264706179756e-19, "kl": 0.02252197265625, "learning_rate": 1.2680627781574828e-05, "loss": 0.0009, "num_tokens": 1356702813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47264658188956216, "frac_reward_zero_std": 1.0, "grad_norm": 4.46929269446364e-19, "kl": 0.02386474609375, "learning_rate": 1.267488747132007e-05, "loss": 0.001, "num_tokens": 1357273645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47281727404625756, "frac_reward_zero_std": 1.0, "grad_norm": 4.2152161918469625e-19, "kl": 0.0238037109375, "learning_rate": 1.2669146211576817e-05, "loss": 0.001, "num_tokens": 1357842093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47298796620295297, "frac_reward_zero_std": 1.0, "grad_norm": 4.075055548861378e-19, "kl": 0.02325439453125, "learning_rate": 1.2663404004383012e-05, "loss": 0.0009, "num_tokens": 1358406013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4731586583596484, "frac_reward_zero_std": 1.0, "grad_norm": 4.47913759720276e-19, "kl": 0.02301025390625, "learning_rate": 1.2657660851776928e-05, "loss": 0.0009, "num_tokens": 1358971021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4733293505163438, "frac_reward_zero_std": 1.0, "grad_norm": 4.372863107965354e-19, "kl": 0.02374267578125, "learning_rate": 1.2651916755797176e-05, "loss": 0.001, "num_tokens": 1359538013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4735000426730392, "frac_reward_zero_std": 1.0, "grad_norm": 4.356652922590914e-19, "kl": 0.023284912109375, "learning_rate": 1.2646171718482706e-05, "loss": 0.0009, "num_tokens": 1360103469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4736707348297346, "frac_reward_zero_std": 1.0, "grad_norm": 4.6092048007233295e-19, "kl": 0.0235595703125, "learning_rate": 1.2640425741872796e-05, "loss": 0.0009, "num_tokens": 1360667549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47384142698642995, "frac_reward_zero_std": 1.0, "grad_norm": 4.549893457887065e-19, "kl": 0.02386474609375, "learning_rate": 1.263467882800706e-05, "loss": 0.001, "num_tokens": 1361231421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47401211914312535, "frac_reward_zero_std": 1.0, "grad_norm": 4.3305262355550054e-19, "kl": 0.022918701171875, "learning_rate": 1.262893097892545e-05, "loss": 0.0009, "num_tokens": 1361794429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47418281129982076, "frac_reward_zero_std": 1.0, "grad_norm": 4.2637469377946236e-19, "kl": 0.02313232421875, "learning_rate": 1.262318219666824e-05, "loss": 0.0009, "num_tokens": 1362368077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47435350345651617, "frac_reward_zero_std": 1.0, "grad_norm": 4.610618822541097e-19, "kl": 0.0238037109375, "learning_rate": 1.261743248327604e-05, "loss": 0.001, "num_tokens": 1362928557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4745241956132116, "frac_reward_zero_std": 1.0, "grad_norm": 4.580430697647529e-19, "kl": 0.024505615234375, "learning_rate": 1.2611681840789796e-05, "loss": 0.001, "num_tokens": 1363494029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.474694887769907, "frac_reward_zero_std": 1.0, "grad_norm": 4.321962032555711e-19, "kl": 0.023345947265625, "learning_rate": 1.2605930271250771e-05, "loss": 0.0009, "num_tokens": 1364063725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4748655799266024, "frac_reward_zero_std": 1.0, "grad_norm": 4.478232988180624e-19, "kl": 0.024078369140625, "learning_rate": 1.2600177776700573e-05, "loss": 0.001, "num_tokens": 1364631629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4750362720832978, "frac_reward_zero_std": 1.0, "grad_norm": 3.8652346623525696e-19, "kl": 0.022979736328125, "learning_rate": 1.259442435918112e-05, "loss": 0.0009, "num_tokens": 1365196781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47520696423999315, "frac_reward_zero_std": 1.0, "grad_norm": 5.008890293760817e-19, "kl": 0.02508544921875, "learning_rate": 1.2588670020734677e-05, "loss": 0.001, "num_tokens": 1365765133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47537765639668855, "frac_reward_zero_std": 1.0, "grad_norm": 4.351372599439898e-19, "kl": 0.02288818359375, "learning_rate": 1.2582914763403817e-05, "loss": 0.0009, "num_tokens": 1366333245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47554834855338396, "frac_reward_zero_std": 1.0, "grad_norm": 4.3115341022714586e-19, "kl": 0.02349853515625, "learning_rate": 1.2577158589231462e-05, "loss": 0.0009, "num_tokens": 1366901005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47571904071007937, "frac_reward_zero_std": 1.0, "grad_norm": 4.045240548338408e-19, "kl": 0.022308349609375, "learning_rate": 1.2571401500260829e-05, "loss": 0.0009, "num_tokens": 1367466685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4758897328667748, "frac_reward_zero_std": 1.0, "grad_norm": 4.617209071181809e-19, "kl": 0.02349853515625, "learning_rate": 1.2565643498535493e-05, "loss": 0.0009, "num_tokens": 1368028797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4760604250234702, "frac_reward_zero_std": 1.0, "grad_norm": 4.523887791186323e-19, "kl": 0.02374267578125, "learning_rate": 1.2559884586099324e-05, "loss": 0.0009, "num_tokens": 1368600109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4762311171801656, "frac_reward_zero_std": 1.0, "grad_norm": 4.287512867078321e-19, "kl": 0.022735595703125, "learning_rate": 1.2554124764996539e-05, "loss": 0.0009, "num_tokens": 1369162253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.476401809336861, "frac_reward_zero_std": 1.0, "grad_norm": 3.859204141735252e-19, "kl": 0.023956298828125, "learning_rate": 1.2548364037271657e-05, "loss": 0.001, "num_tokens": 1369723757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47657250149355634, "frac_reward_zero_std": 1.0, "grad_norm": 3.810371326591559e-19, "kl": 0.02337646484375, "learning_rate": 1.254260240496953e-05, "loss": 0.0009, "num_tokens": 1370298957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47674319365025175, "frac_reward_zero_std": 1.0, "grad_norm": 4.514427071120072e-19, "kl": 0.02374267578125, "learning_rate": 1.2536839870135337e-05, "loss": 0.001, "num_tokens": 1370861421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47691388580694716, "frac_reward_zero_std": 1.0, "grad_norm": 4.292787846684544e-19, "kl": 0.023834228515625, "learning_rate": 1.2531076434814562e-05, "loss": 0.001, "num_tokens": 1371427405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47708457796364256, "frac_reward_zero_std": 1.0, "grad_norm": 4.554176261635631e-19, "kl": 0.023406982421875, "learning_rate": 1.2525312101053022e-05, "loss": 0.0009, "num_tokens": 1371989517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47725527012033797, "frac_reward_zero_std": 1.0, "grad_norm": 3.960503791828266e-19, "kl": 0.022491455078125, "learning_rate": 1.2519546870896844e-05, "loss": 0.0009, "num_tokens": 1372553133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4774259622770334, "frac_reward_zero_std": 1.0, "grad_norm": 4.2693675622664835e-19, "kl": 0.0224609375, "learning_rate": 1.2513780746392474e-05, "loss": 0.0009, "num_tokens": 1373131117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4775966544337288, "frac_reward_zero_std": 1.0, "grad_norm": 4.730471377155629e-19, "kl": 0.0238037109375, "learning_rate": 1.2508013729586686e-05, "loss": 0.001, "num_tokens": 1373699965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4777673465904242, "frac_reward_zero_std": 1.0, "grad_norm": 4.666508073357342e-19, "kl": 0.023193359375, "learning_rate": 1.2502245822526553e-05, "loss": 0.0009, "num_tokens": 1374264909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47793803874711954, "frac_reward_zero_std": 1.0, "grad_norm": 4.522831081715798e-19, "kl": 0.023223876953125, "learning_rate": 1.2496477027259482e-05, "loss": 0.0009, "num_tokens": 1374831165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47810873090381495, "frac_reward_zero_std": 1.0, "grad_norm": 4.967296382586625e-19, "kl": 0.02447509765625, "learning_rate": 1.2490707345833184e-05, "loss": 0.001, "num_tokens": 1375395117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47827942306051036, "frac_reward_zero_std": 1.0, "grad_norm": 4.1703772037193154e-19, "kl": 0.0225830078125, "learning_rate": 1.2484936780295687e-05, "loss": 0.0009, "num_tokens": 1375961197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47845011521720576, "frac_reward_zero_std": 1.0, "grad_norm": 4.04701280829107e-19, "kl": 0.0228271484375, "learning_rate": 1.2479165332695333e-05, "loss": 0.0009, "num_tokens": 1376527949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47862080737390117, "frac_reward_zero_std": 1.0, "grad_norm": 3.975581349652593e-19, "kl": 0.0230712890625, "learning_rate": 1.247339300508078e-05, "loss": 0.0009, "num_tokens": 1377099869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4787914995305966, "frac_reward_zero_std": 1.0, "grad_norm": 4.622291776375038e-19, "kl": 0.023529052734375, "learning_rate": 1.2467619799500996e-05, "loss": 0.0009, "num_tokens": 1377670365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.478962191687292, "frac_reward_zero_std": 1.0, "grad_norm": 4.616753558354282e-19, "kl": 0.02410888671875, "learning_rate": 1.2461845718005257e-05, "loss": 0.001, "num_tokens": 1378233997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4791328838439874, "frac_reward_zero_std": 1.0, "grad_norm": 4.365000302517271e-19, "kl": 0.023529052734375, "learning_rate": 1.2456070762643155e-05, "loss": 0.0009, "num_tokens": 1378796925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4793035760006828, "frac_reward_zero_std": 1.0, "grad_norm": 4.500400645120845e-19, "kl": 0.023101806640625, "learning_rate": 1.2450294935464593e-05, "loss": 0.0009, "num_tokens": 1379357501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47947426815737815, "frac_reward_zero_std": 1.0, "grad_norm": 4.3258320908000057e-19, "kl": 0.02313232421875, "learning_rate": 1.244451823851978e-05, "loss": 0.0009, "num_tokens": 1379921965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47964496031407355, "frac_reward_zero_std": 1.0, "grad_norm": 4.600080330954181e-19, "kl": 0.022308349609375, "learning_rate": 1.2438740673859232e-05, "loss": 0.0009, "num_tokens": 1380491229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47981565247076896, "frac_reward_zero_std": 1.0, "grad_norm": 4.200456033379515e-19, "kl": 0.0235595703125, "learning_rate": 1.2432962243533777e-05, "loss": 0.0009, "num_tokens": 1381054573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47998634462746437, "frac_reward_zero_std": 1.0, "grad_norm": 4.2773880065482274e-19, "kl": 0.023468017578125, "learning_rate": 1.2427182949594555e-05, "loss": 0.0009, "num_tokens": 1381624301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4801570367841598, "frac_reward_zero_std": 1.0, "grad_norm": 4.488175673218311e-19, "kl": 0.022979736328125, "learning_rate": 1.2421402794092998e-05, "loss": 0.0009, "num_tokens": 1382192877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4803277289408552, "frac_reward_zero_std": 1.0, "grad_norm": 4.605825856548779e-19, "kl": 0.024169921875, "learning_rate": 1.2415621779080862e-05, "loss": 0.001, "num_tokens": 1382753517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4804984210975506, "frac_reward_zero_std": 1.0, "grad_norm": 4.520400759969877e-19, "kl": 0.023956298828125, "learning_rate": 1.2409839906610188e-05, "loss": 0.001, "num_tokens": 1383316333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.480669113254246, "frac_reward_zero_std": 1.0, "grad_norm": 4.2902046909467883e-19, "kl": 0.0242919921875, "learning_rate": 1.2404057178733341e-05, "loss": 0.001, "num_tokens": 1383883565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48083980541094135, "frac_reward_zero_std": 1.0, "grad_norm": 4.219358270813182e-19, "kl": 0.023223876953125, "learning_rate": 1.2398273597502974e-05, "loss": 0.0009, "num_tokens": 1384451933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48101049756763675, "frac_reward_zero_std": 1.0, "grad_norm": 3.9506813831260183e-19, "kl": 0.02276611328125, "learning_rate": 1.2392489164972057e-05, "loss": 0.0009, "num_tokens": 1385016909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48118118972433216, "frac_reward_zero_std": 1.0, "grad_norm": 4.26692868545244e-19, "kl": 0.023223876953125, "learning_rate": 1.2386703883193845e-05, "loss": 0.0009, "num_tokens": 1385585117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48135188188102757, "frac_reward_zero_std": 1.0, "grad_norm": 4.244169562391535e-19, "kl": 0.02252197265625, "learning_rate": 1.2380917754221914e-05, "loss": 0.0009, "num_tokens": 1386154317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.481522574037723, "frac_reward_zero_std": 1.0, "grad_norm": 4.387079051742701e-19, "kl": 0.02325439453125, "learning_rate": 1.2375130780110122e-05, "loss": 0.0009, "num_tokens": 1386722461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4816932661944184, "frac_reward_zero_std": 1.0, "grad_norm": 4.1548354302625953e-19, "kl": 0.02325439453125, "learning_rate": 1.2369342962912643e-05, "loss": 0.0009, "num_tokens": 1387291645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4818639583511138, "frac_reward_zero_std": 1.0, "grad_norm": 4.3787504118684865e-19, "kl": 0.02313232421875, "learning_rate": 1.236355430468394e-05, "loss": 0.0009, "num_tokens": 1387858157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4820346505078092, "frac_reward_zero_std": 1.0, "grad_norm": 4.2144141525043593e-19, "kl": 0.02288818359375, "learning_rate": 1.235776480747878e-05, "loss": 0.0009, "num_tokens": 1388425597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48220534266450454, "frac_reward_zero_std": 1.0, "grad_norm": 4.269901972751724e-19, "kl": 0.02362060546875, "learning_rate": 1.2351974473352221e-05, "loss": 0.0009, "num_tokens": 1388986301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48237603482119995, "frac_reward_zero_std": 1.0, "grad_norm": 4.622456817468769e-19, "kl": 0.02392578125, "learning_rate": 1.234618330435963e-05, "loss": 0.001, "num_tokens": 1389552749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48254672697789536, "frac_reward_zero_std": 1.0, "grad_norm": 5.207455831110226e-19, "kl": 0.023773193359375, "learning_rate": 1.2340391302556654e-05, "loss": 0.0009, "num_tokens": 1390115197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48271741913459076, "frac_reward_zero_std": 1.0, "grad_norm": 4.473024601576931e-19, "kl": 0.023529052734375, "learning_rate": 1.2334598469999248e-05, "loss": 0.0009, "num_tokens": 1390683853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48288811129128617, "frac_reward_zero_std": 1.0, "grad_norm": 4.117129996996879e-19, "kl": 0.023193359375, "learning_rate": 1.2328804808743662e-05, "loss": 0.0009, "num_tokens": 1391249949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4830588034479816, "frac_reward_zero_std": 1.0, "grad_norm": 4.345568995852437e-19, "kl": 0.022216796875, "learning_rate": 1.2323010320846434e-05, "loss": 0.0009, "num_tokens": 1391812685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.483229495604677, "frac_reward_zero_std": 1.0, "grad_norm": 4.436226331408483e-19, "kl": 0.0233154296875, "learning_rate": 1.2317215008364397e-05, "loss": 0.0009, "num_tokens": 1392382893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4834001877613724, "frac_reward_zero_std": 1.0, "grad_norm": 4.3006214213268223e-19, "kl": 0.02349853515625, "learning_rate": 1.2311418873354679e-05, "loss": 0.0009, "num_tokens": 1392946045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48357087991806774, "frac_reward_zero_std": 1.0, "grad_norm": 4.016870143818719e-19, "kl": 0.022552490234375, "learning_rate": 1.2305621917874697e-05, "loss": 0.0009, "num_tokens": 1393508813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48374157207476315, "frac_reward_zero_std": 1.0, "grad_norm": 4.103635774241751e-19, "kl": 0.022705078125, "learning_rate": 1.2299824143982165e-05, "loss": 0.0009, "num_tokens": 1394085677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48391226423145856, "frac_reward_zero_std": 1.0, "grad_norm": 4.46107167667224e-19, "kl": 0.023529052734375, "learning_rate": 1.2294025553735078e-05, "loss": 0.0009, "num_tokens": 1394654093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48408295638815396, "frac_reward_zero_std": 1.0, "grad_norm": 4.150492222485848e-19, "kl": 0.0238037109375, "learning_rate": 1.2288226149191731e-05, "loss": 0.001, "num_tokens": 1395221149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48425364854484937, "frac_reward_zero_std": 1.0, "grad_norm": 4.342624800083074e-19, "kl": 0.022735595703125, "learning_rate": 1.22824259324107e-05, "loss": 0.0009, "num_tokens": 1395784957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4844243407015448, "frac_reward_zero_std": 1.0, "grad_norm": 4.352923086222807e-19, "kl": 0.0240478515625, "learning_rate": 1.2276624905450856e-05, "loss": 0.001, "num_tokens": 1396349021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4845950328582402, "frac_reward_zero_std": 1.0, "grad_norm": 4.3062530346838197e-19, "kl": 0.02288818359375, "learning_rate": 1.2270823070371349e-05, "loss": 0.0009, "num_tokens": 1396915373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4847657250149356, "frac_reward_zero_std": 1.0, "grad_norm": 5.518482885794553e-19, "kl": 0.025665283203125, "learning_rate": 1.2265020429231627e-05, "loss": 0.001, "num_tokens": 1397479501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48493641717163094, "frac_reward_zero_std": 1.0, "grad_norm": 4.89902564114981e-19, "kl": 0.023468017578125, "learning_rate": 1.2259216984091411e-05, "loss": 0.0009, "num_tokens": 1398041357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48510710932832635, "frac_reward_zero_std": 1.0, "grad_norm": 4.419033436369897e-19, "kl": 0.023773193359375, "learning_rate": 1.2253412737010723e-05, "loss": 0.001, "num_tokens": 1398600013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48527780148502175, "frac_reward_zero_std": 1.0, "grad_norm": 4.0325142569203143e-19, "kl": 0.022369384765625, "learning_rate": 1.2247607690049855e-05, "loss": 0.0009, "num_tokens": 1399167213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48544849364171716, "frac_reward_zero_std": 1.0, "grad_norm": 4.422012146580511e-19, "kl": 0.02252197265625, "learning_rate": 1.2241801845269392e-05, "loss": 0.0009, "num_tokens": 1399731005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48561918579841257, "frac_reward_zero_std": 1.0, "grad_norm": 3.880364139761058e-19, "kl": 0.023284912109375, "learning_rate": 1.2235995204730195e-05, "loss": 0.0009, "num_tokens": 1400290637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.485789877955108, "frac_reward_zero_std": 1.0, "grad_norm": 4.159841730459079e-19, "kl": 0.023162841796875, "learning_rate": 1.223018777049342e-05, "loss": 0.0009, "num_tokens": 1400857165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4859605701118034, "frac_reward_zero_std": 1.0, "grad_norm": 4.3302849387158747e-19, "kl": 0.02313232421875, "learning_rate": 1.2224379544620486e-05, "loss": 0.0009, "num_tokens": 1401422813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4861312622684988, "frac_reward_zero_std": 1.0, "grad_norm": 4.22776894658353e-19, "kl": 0.022430419921875, "learning_rate": 1.2218570529173115e-05, "loss": 0.0009, "num_tokens": 1401986749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48630195442519414, "frac_reward_zero_std": 1.0, "grad_norm": 4.14027440551942e-19, "kl": 0.02191162109375, "learning_rate": 1.2212760726213289e-05, "loss": 0.0009, "num_tokens": 1402548221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48647264658188955, "frac_reward_zero_std": 1.0, "grad_norm": 4.606968147954154e-19, "kl": 0.024871826171875, "learning_rate": 1.2206950137803282e-05, "loss": 0.001, "num_tokens": 1403114413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48664333873858495, "frac_reward_zero_std": 1.0, "grad_norm": 4.0381213942906587e-19, "kl": 0.023193359375, "learning_rate": 1.220113876600564e-05, "loss": 0.0009, "num_tokens": 1403680749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48681403089528036, "frac_reward_zero_std": 1.0, "grad_norm": 3.9801988937622732e-19, "kl": 0.02227783203125, "learning_rate": 1.2195326612883196e-05, "loss": 0.0009, "num_tokens": 1404243245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48698472305197577, "frac_reward_zero_std": 1.0, "grad_norm": 4.80883747798617e-19, "kl": 0.02301025390625, "learning_rate": 1.218951368049905e-05, "loss": 0.0009, "num_tokens": 1404812637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48715541520867117, "frac_reward_zero_std": 1.0, "grad_norm": 4.429481358990045e-19, "kl": 0.0233154296875, "learning_rate": 1.2183699970916583e-05, "loss": 0.0009, "num_tokens": 1405377261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4873261073653666, "frac_reward_zero_std": 1.0, "grad_norm": 4.586621951086866e-19, "kl": 0.024017333984375, "learning_rate": 1.2177885486199454e-05, "loss": 0.001, "num_tokens": 1405942637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.487496799522062, "frac_reward_zero_std": 1.0, "grad_norm": 4.182414575551508e-19, "kl": 0.022796630859375, "learning_rate": 1.2172070228411597e-05, "loss": 0.0009, "num_tokens": 1406508557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48766749167875734, "frac_reward_zero_std": 1.0, "grad_norm": 4.097894640965691e-19, "kl": 0.0234375, "learning_rate": 1.2166254199617214e-05, "loss": 0.0009, "num_tokens": 1407073197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48783818383545274, "frac_reward_zero_std": 1.0, "grad_norm": 4.0255062585713056e-19, "kl": 0.023101806640625, "learning_rate": 1.2160437401880786e-05, "loss": 0.0009, "num_tokens": 1407644285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48800887599214815, "frac_reward_zero_std": 1.0, "grad_norm": 4.070550390021938e-19, "kl": 0.02374267578125, "learning_rate": 1.215461983726707e-05, "loss": 0.0009, "num_tokens": 1408213117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48817956814884356, "frac_reward_zero_std": 1.0, "grad_norm": 4.622572818121283e-19, "kl": 0.023284912109375, "learning_rate": 1.2148801507841088e-05, "loss": 0.0009, "num_tokens": 1408778637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48835026030553896, "frac_reward_zero_std": 1.0, "grad_norm": 4.0087001166468016e-19, "kl": 0.02264404296875, "learning_rate": 1.2142982415668139e-05, "loss": 0.0009, "num_tokens": 1409340461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48852095246223437, "frac_reward_zero_std": 1.0, "grad_norm": 4.513591604288023e-19, "kl": 0.0235595703125, "learning_rate": 1.2137162562813789e-05, "loss": 0.0009, "num_tokens": 1409907725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4886916446189298, "frac_reward_zero_std": 1.0, "grad_norm": 4.179204748805195e-19, "kl": 0.023040771484375, "learning_rate": 1.2131341951343876e-05, "loss": 0.0009, "num_tokens": 1410477149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4888623367756252, "frac_reward_zero_std": 1.0, "grad_norm": 4.219848039630424e-19, "kl": 0.023193359375, "learning_rate": 1.2125520583324508e-05, "loss": 0.0009, "num_tokens": 1411046909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48903302893232053, "frac_reward_zero_std": 1.0, "grad_norm": 4.4502173759604445e-19, "kl": 0.02349853515625, "learning_rate": 1.211969846082206e-05, "loss": 0.0009, "num_tokens": 1411609645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48920372108901594, "frac_reward_zero_std": 1.0, "grad_norm": 4.18148233134941e-19, "kl": 0.022979736328125, "learning_rate": 1.2113875585903173e-05, "loss": 0.0009, "num_tokens": 1412175421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48937441324571135, "frac_reward_zero_std": 1.0, "grad_norm": 4.955128554324421e-19, "kl": 0.02545166015625, "learning_rate": 1.2108051960634761e-05, "loss": 0.001, "num_tokens": 1412737917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48954510540240675, "frac_reward_zero_std": 1.0, "grad_norm": 4.536857817208534e-19, "kl": 0.023895263671875, "learning_rate": 1.2102227587084e-05, "loss": 0.001, "num_tokens": 1413306093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48971579755910216, "frac_reward_zero_std": 1.0, "grad_norm": 4.348122066422077e-19, "kl": 0.022674560546875, "learning_rate": 1.2096402467318332e-05, "loss": 0.0009, "num_tokens": 1413869437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48988648971579757, "frac_reward_zero_std": 1.0, "grad_norm": 4.751144742733294e-19, "kl": 0.0244140625, "learning_rate": 1.2090576603405462e-05, "loss": 0.001, "num_tokens": 1414436205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.490057181872493, "frac_reward_zero_std": 1.0, "grad_norm": 4.216570013334209e-19, "kl": 0.023101806640625, "learning_rate": 1.2084749997413363e-05, "loss": 0.0009, "num_tokens": 1415007421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4902278740291884, "frac_reward_zero_std": 1.0, "grad_norm": 3.9598681426431663e-19, "kl": 0.022735595703125, "learning_rate": 1.2078922651410272e-05, "loss": 0.0009, "num_tokens": 1415587181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49039856618588373, "frac_reward_zero_std": 1.0, "grad_norm": 4.751282721578197e-19, "kl": 0.0242919921875, "learning_rate": 1.2073094567464683e-05, "loss": 0.001, "num_tokens": 1416151437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49056925834257914, "frac_reward_zero_std": 1.0, "grad_norm": 4.454398836529524e-19, "kl": 0.023468017578125, "learning_rate": 1.206726574764536e-05, "loss": 0.0009, "num_tokens": 1416720893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49073995049927455, "frac_reward_zero_std": 1.0, "grad_norm": 4.1539388929125537e-19, "kl": 0.0230712890625, "learning_rate": 1.2061436194021317e-05, "loss": 0.0009, "num_tokens": 1417285133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49091064265596995, "frac_reward_zero_std": 1.0, "grad_norm": 4.233181625481781e-19, "kl": 0.023712158203125, "learning_rate": 1.205560590866184e-05, "loss": 0.0009, "num_tokens": 1417851901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49108133481266536, "frac_reward_zero_std": 1.0, "grad_norm": 4.417999166768942e-19, "kl": 0.024658203125, "learning_rate": 1.2049774893636468e-05, "loss": 0.001, "num_tokens": 1418415453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49125202696936077, "frac_reward_zero_std": 1.0, "grad_norm": 4.0073999646102273e-19, "kl": 0.022247314453125, "learning_rate": 1.2043943151015003e-05, "loss": 0.0009, "num_tokens": 1418989197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4914227191260562, "frac_reward_zero_std": 1.0, "grad_norm": 3.914133893680121e-19, "kl": 0.02276611328125, "learning_rate": 1.20381106828675e-05, "loss": 0.0009, "num_tokens": 1419558317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4915934112827516, "frac_reward_zero_std": 1.0, "grad_norm": 4.40079066363239e-19, "kl": 0.024200439453125, "learning_rate": 1.2032277491264277e-05, "loss": 0.001, "num_tokens": 1420126157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49176410343944693, "frac_reward_zero_std": 1.0, "grad_norm": 4.343582129684455e-19, "kl": 0.023162841796875, "learning_rate": 1.2026443578275903e-05, "loss": 0.0009, "num_tokens": 1420702509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49193479559614234, "frac_reward_zero_std": 1.0, "grad_norm": 4.2821109396577925e-19, "kl": 0.02294921875, "learning_rate": 1.2020608945973216e-05, "loss": 0.0009, "num_tokens": 1421269101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49210548775283774, "frac_reward_zero_std": 1.0, "grad_norm": 4.2269360434466674e-19, "kl": 0.023101806640625, "learning_rate": 1.2014773596427284e-05, "loss": 0.0009, "num_tokens": 1421834717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49227617990953315, "frac_reward_zero_std": 1.0, "grad_norm": 4.641098046949255e-19, "kl": 0.024810791015625, "learning_rate": 1.200893753170946e-05, "loss": 0.001, "num_tokens": 1422401901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49244687206622856, "frac_reward_zero_std": 1.0, "grad_norm": 4.356797544991917e-19, "kl": 0.023040771484375, "learning_rate": 1.2003100753891328e-05, "loss": 0.0009, "num_tokens": 1422969021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49261756422292396, "frac_reward_zero_std": 1.0, "grad_norm": 4.360011957210801e-19, "kl": 0.02337646484375, "learning_rate": 1.1997263265044739e-05, "loss": 0.0009, "num_tokens": 1423528909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49278825637961937, "frac_reward_zero_std": 1.0, "grad_norm": 4.552961518946697e-19, "kl": 0.02386474609375, "learning_rate": 1.1991425067241787e-05, "loss": 0.001, "num_tokens": 1424091741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4929589485363148, "frac_reward_zero_std": 1.0, "grad_norm": 4.446282613459027e-19, "kl": 0.024200439453125, "learning_rate": 1.198558616255482e-05, "loss": 0.001, "num_tokens": 1424655533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49312964069301013, "frac_reward_zero_std": 1.0, "grad_norm": 4.232179858711282e-19, "kl": 0.0235595703125, "learning_rate": 1.1979746553056443e-05, "loss": 0.0009, "num_tokens": 1425226621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49330033284970554, "frac_reward_zero_std": 1.0, "grad_norm": 4.243437527288387e-19, "kl": 0.02276611328125, "learning_rate": 1.1973906240819506e-05, "loss": 0.0009, "num_tokens": 1425794365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49347102500640094, "frac_reward_zero_std": 1.0, "grad_norm": 4.3338029724355207e-19, "kl": 0.023651123046875, "learning_rate": 1.1968065227917105e-05, "loss": 0.0009, "num_tokens": 1426358045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49364171716309635, "frac_reward_zero_std": 1.0, "grad_norm": 4.505912587858242e-19, "kl": 0.023040771484375, "learning_rate": 1.1962223516422593e-05, "loss": 0.0009, "num_tokens": 1426922749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49381240931979176, "frac_reward_zero_std": 1.0, "grad_norm": 4.656449032573643e-19, "kl": 0.024444580078125, "learning_rate": 1.1956381108409563e-05, "loss": 0.001, "num_tokens": 1427485261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49398310147648716, "frac_reward_zero_std": 1.0, "grad_norm": 4.0991503730331184e-19, "kl": 0.02276611328125, "learning_rate": 1.1950538005951862e-05, "loss": 0.0009, "num_tokens": 1428052397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49415379363318257, "frac_reward_zero_std": 1.0, "grad_norm": 4.3114404005526835e-19, "kl": 0.022705078125, "learning_rate": 1.1944694211123578e-05, "loss": 0.0009, "num_tokens": 1428623485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.494324485789878, "frac_reward_zero_std": 1.0, "grad_norm": 4.787380190196686e-19, "kl": 0.0242919921875, "learning_rate": 1.1938849725999048e-05, "loss": 0.001, "num_tokens": 1429189837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4944951779465733, "frac_reward_zero_std": 1.0, "grad_norm": 4.646422370462977e-19, "kl": 0.024322509765625, "learning_rate": 1.1933004552652859e-05, "loss": 0.001, "num_tokens": 1429753789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49466587010326873, "frac_reward_zero_std": 1.0, "grad_norm": 4.64849790973271e-19, "kl": 0.023681640625, "learning_rate": 1.1927158693159826e-05, "loss": 0.0009, "num_tokens": 1430315597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49483656225996414, "frac_reward_zero_std": 1.0, "grad_norm": 4.0505855028907594e-19, "kl": 0.023101806640625, "learning_rate": 1.1921312149595024e-05, "loss": 0.0009, "num_tokens": 1430887357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49500725441665955, "frac_reward_zero_std": 1.0, "grad_norm": 4.636773071408829e-19, "kl": 0.02362060546875, "learning_rate": 1.1915464924033766e-05, "loss": 0.0009, "num_tokens": 1431447997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49517794657335495, "frac_reward_zero_std": 1.0, "grad_norm": 4.2950306422404314e-19, "kl": 0.02362060546875, "learning_rate": 1.19096170185516e-05, "loss": 0.0009, "num_tokens": 1432016541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49534863873005036, "frac_reward_zero_std": 1.0, "grad_norm": 4.2372434374210346e-19, "kl": 0.023345947265625, "learning_rate": 1.1903768435224333e-05, "loss": 0.0009, "num_tokens": 1432589117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49551933088674577, "frac_reward_zero_std": 1.0, "grad_norm": 3.8531928020451577e-19, "kl": 0.0240478515625, "learning_rate": 1.1897919176127986e-05, "loss": 0.001, "num_tokens": 1433157229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4956900230434412, "frac_reward_zero_std": 1.0, "grad_norm": 4.0422054806115555e-19, "kl": 0.023468017578125, "learning_rate": 1.1892069243338848e-05, "loss": 0.0009, "num_tokens": 1433723341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4958607152001366, "frac_reward_zero_std": 1.0, "grad_norm": 4.329402645649049e-19, "kl": 0.023406982421875, "learning_rate": 1.1886218638933424e-05, "loss": 0.0009, "num_tokens": 1434302413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49603140735683193, "frac_reward_zero_std": 1.0, "grad_norm": 4.167554366728655e-19, "kl": 0.02276611328125, "learning_rate": 1.1880367364988476e-05, "loss": 0.0009, "num_tokens": 1434867133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49620209951352734, "frac_reward_zero_std": 1.0, "grad_norm": 4.370267800669636e-19, "kl": 0.023651123046875, "learning_rate": 1.1874515423580988e-05, "loss": 0.0009, "num_tokens": 1435433517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49637279167022275, "frac_reward_zero_std": 1.0, "grad_norm": 4.393415768506061e-19, "kl": 0.023529052734375, "learning_rate": 1.1868662816788192e-05, "loss": 0.0009, "num_tokens": 1435998285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49654348382691815, "frac_reward_zero_std": 1.0, "grad_norm": 4.134456483267659e-19, "kl": 0.02301025390625, "learning_rate": 1.1862809546687548e-05, "loss": 0.0009, "num_tokens": 1436564093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49671417598361356, "frac_reward_zero_std": 1.0, "grad_norm": 4.494108255051579e-19, "kl": 0.02349853515625, "learning_rate": 1.1856955615356765e-05, "loss": 0.0009, "num_tokens": 1437127565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49688486814030897, "frac_reward_zero_std": 1.0, "grad_norm": 4.336903203054669e-19, "kl": 0.022857666015625, "learning_rate": 1.1851101024873763e-05, "loss": 0.0009, "num_tokens": 1437691357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4970555602970044, "frac_reward_zero_std": 1.0, "grad_norm": 4.413561246321773e-19, "kl": 0.023345947265625, "learning_rate": 1.1845245777316724e-05, "loss": 0.0009, "num_tokens": 1438257869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4972262524536998, "frac_reward_zero_std": 1.0, "grad_norm": 4.224917192507489e-19, "kl": 0.022857666015625, "learning_rate": 1.183938987476404e-05, "loss": 0.0009, "num_tokens": 1438831245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49739694461039513, "frac_reward_zero_std": 1.0, "grad_norm": 4.578093082481175e-19, "kl": 0.0238037109375, "learning_rate": 1.183353331929435e-05, "loss": 0.001, "num_tokens": 1439400365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49756763676709054, "frac_reward_zero_std": 1.0, "grad_norm": 4.112580541042649e-19, "kl": 0.0225830078125, "learning_rate": 1.1827676112986519e-05, "loss": 0.0009, "num_tokens": 1439972845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49773832892378594, "frac_reward_zero_std": 1.0, "grad_norm": 4.433396356747105e-19, "kl": 0.0235595703125, "learning_rate": 1.1821818257919642e-05, "loss": 0.0009, "num_tokens": 1440544317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49790902108048135, "frac_reward_zero_std": 1.0, "grad_norm": 4.3721313377190795e-19, "kl": 0.023468017578125, "learning_rate": 1.1815959756173047e-05, "loss": 0.0009, "num_tokens": 1441113677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49807971323717676, "frac_reward_zero_std": 1.0, "grad_norm": 4.481134169097401e-19, "kl": 0.02349853515625, "learning_rate": 1.1810100609826294e-05, "loss": 0.0009, "num_tokens": 1441682653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49825040539387216, "frac_reward_zero_std": 1.0, "grad_norm": 4.2195192473160082e-19, "kl": 0.023223876953125, "learning_rate": 1.1804240820959163e-05, "loss": 0.0009, "num_tokens": 1442247757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49842109755056757, "frac_reward_zero_std": 1.0, "grad_norm": 4.603504968681237e-19, "kl": 0.0234375, "learning_rate": 1.1798380391651669e-05, "loss": 0.0009, "num_tokens": 1442809549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.498591789707263, "frac_reward_zero_std": 1.0, "grad_norm": 4.375631345383949e-19, "kl": 0.0234375, "learning_rate": 1.1792519323984056e-05, "loss": 0.0009, "num_tokens": 1443372749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49876248186395833, "frac_reward_zero_std": 1.0, "grad_norm": 5.010148193515215e-19, "kl": 0.023956298828125, "learning_rate": 1.1786657620036788e-05, "loss": 0.001, "num_tokens": 1443937949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49893317402065374, "frac_reward_zero_std": 1.0, "grad_norm": 4.014277689268951e-19, "kl": 0.0218505859375, "learning_rate": 1.178079528189056e-05, "loss": 0.0009, "num_tokens": 1444511645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49910386617734914, "frac_reward_zero_std": 1.0, "grad_norm": 4.617369851705714e-19, "kl": 0.0240478515625, "learning_rate": 1.177493231162629e-05, "loss": 0.001, "num_tokens": 1445078765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49927455833404455, "frac_reward_zero_std": 1.0, "grad_norm": 4.119061180776723e-19, "kl": 0.02362060546875, "learning_rate": 1.176906871132512e-05, "loss": 0.0009, "num_tokens": 1445649069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49944525049073996, "frac_reward_zero_std": 1.0, "grad_norm": 4.891698035880938e-19, "kl": 0.0240478515625, "learning_rate": 1.1763204483068418e-05, "loss": 0.001, "num_tokens": 1446210813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49961594264743536, "frac_reward_zero_std": 1.0, "grad_norm": 4.0252971743236406e-19, "kl": 0.02264404296875, "learning_rate": 1.1757339628937772e-05, "loss": 0.0009, "num_tokens": 1446779501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49978663480413077, "frac_reward_zero_std": 1.0, "grad_norm": 4.40055866206161e-19, "kl": 0.022308349609375, "learning_rate": 1.1751474151014995e-05, "loss": 0.0009, "num_tokens": 1447343421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4999573269608262, "frac_reward_zero_std": 1.0, "grad_norm": 4.589852514973728e-19, "kl": 0.023895263671875, "learning_rate": 1.1745608051382118e-05, "loss": 0.001, "num_tokens": 1447906477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5001280191175216, "frac_reward_zero_std": 1.0, "grad_norm": 3.8002666803152287e-19, "kl": 0.022491455078125, "learning_rate": 1.1739741332121394e-05, "loss": 0.0009, "num_tokens": 1448478589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.500298711274217, "frac_reward_zero_std": 1.0, "grad_norm": 4.441478649884911e-19, "kl": 0.02435302734375, "learning_rate": 1.1733873995315301e-05, "loss": 0.001, "num_tokens": 1449043997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5004694034309124, "frac_reward_zero_std": 1.0, "grad_norm": 3.936452035987114e-19, "kl": 0.022735595703125, "learning_rate": 1.1728006043046528e-05, "loss": 0.0009, "num_tokens": 1449609101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5006400955876078, "frac_reward_zero_std": 1.0, "grad_norm": 4.2396466844909053e-19, "kl": 0.02294921875, "learning_rate": 1.1722137477397983e-05, "loss": 0.0009, "num_tokens": 1450174877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5008107877443031, "frac_reward_zero_std": 1.0, "grad_norm": 4.763030308127398e-19, "kl": 0.0240478515625, "learning_rate": 1.1716268300452807e-05, "loss": 0.001, "num_tokens": 1450754029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5009814799009985, "frac_reward_zero_std": 1.0, "grad_norm": 4.587892952070165e-19, "kl": 0.02410888671875, "learning_rate": 1.1710398514294333e-05, "loss": 0.001, "num_tokens": 1451316173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5011521720576939, "frac_reward_zero_std": 1.0, "grad_norm": 4.388792075149512e-19, "kl": 0.023193359375, "learning_rate": 1.1704528121006132e-05, "loss": 0.0009, "num_tokens": 1451880381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5013228642143893, "frac_reward_zero_std": 1.0, "grad_norm": 4.359200766237122e-19, "kl": 0.02294921875, "learning_rate": 1.1698657122671973e-05, "loss": 0.0009, "num_tokens": 1452442493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5014935563710847, "frac_reward_zero_std": 1.0, "grad_norm": 4.480841441193226e-19, "kl": 0.023956298828125, "learning_rate": 1.1692785521375859e-05, "loss": 0.001, "num_tokens": 1453003437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5016642485277801, "frac_reward_zero_std": 1.0, "grad_norm": 4.749790932426532e-19, "kl": 0.024749755859375, "learning_rate": 1.1686913319201985e-05, "loss": 0.001, "num_tokens": 1453563741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5018349406844755, "frac_reward_zero_std": 1.0, "grad_norm": 4.532164709848335e-19, "kl": 0.0238037109375, "learning_rate": 1.1681040518234781e-05, "loss": 0.001, "num_tokens": 1454127789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.502005632841171, "frac_reward_zero_std": 1.0, "grad_norm": 4.311205189829528e-19, "kl": 0.022613525390625, "learning_rate": 1.167516712055887e-05, "loss": 0.0009, "num_tokens": 1454694077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5021763249978664, "frac_reward_zero_std": 1.0, "grad_norm": 4.357712647246399e-19, "kl": 0.023406982421875, "learning_rate": 1.1669293128259107e-05, "loss": 0.0009, "num_tokens": 1455263741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5023470171545618, "frac_reward_zero_std": 1.0, "grad_norm": 4.219485673642362e-19, "kl": 0.0225830078125, "learning_rate": 1.166341854342054e-05, "loss": 0.0009, "num_tokens": 1455832605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5025177093112572, "frac_reward_zero_std": 1.0, "grad_norm": 4.2742151732160277e-19, "kl": 0.0233154296875, "learning_rate": 1.1657543368128434e-05, "loss": 0.0009, "num_tokens": 1456393949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5026884014679526, "frac_reward_zero_std": 1.0, "grad_norm": 4.5024939503528415e-19, "kl": 0.02392578125, "learning_rate": 1.1651667604468268e-05, "loss": 0.001, "num_tokens": 1456959837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.502859093624648, "frac_reward_zero_std": 1.0, "grad_norm": 4.480467380906601e-19, "kl": 0.024566650390625, "learning_rate": 1.1645791254525723e-05, "loss": 0.001, "num_tokens": 1457521501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5030297857813434, "frac_reward_zero_std": 1.0, "grad_norm": 4.05071656958704e-19, "kl": 0.0228271484375, "learning_rate": 1.1639914320386694e-05, "loss": 0.0009, "num_tokens": 1458093133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5032004779380388, "frac_reward_zero_std": 1.0, "grad_norm": 4.3029124941350644e-19, "kl": 0.022796630859375, "learning_rate": 1.1634036804137279e-05, "loss": 0.0009, "num_tokens": 1458660493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5033711700947342, "frac_reward_zero_std": 1.0, "grad_norm": 4.112028244625483e-19, "kl": 0.0233154296875, "learning_rate": 1.162815870786378e-05, "loss": 0.0009, "num_tokens": 1459233133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5035418622514295, "frac_reward_zero_std": 1.0, "grad_norm": 4.068088660399962e-19, "kl": 0.0228271484375, "learning_rate": 1.1622280033652718e-05, "loss": 0.0009, "num_tokens": 1459802941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5037125544081249, "frac_reward_zero_std": 1.0, "grad_norm": 3.7966536655494014e-19, "kl": 0.021728515625, "learning_rate": 1.1616400783590802e-05, "loss": 0.0009, "num_tokens": 1460368621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5038832465648203, "frac_reward_zero_std": 1.0, "grad_norm": 4.426880619083906e-19, "kl": 0.023681640625, "learning_rate": 1.1610520959764957e-05, "loss": 0.0009, "num_tokens": 1460937165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5040539387215157, "frac_reward_zero_std": 1.0, "grad_norm": 4.678010640322331e-19, "kl": 0.02520751953125, "learning_rate": 1.160464056426231e-05, "loss": 0.001, "num_tokens": 1461501757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5042246308782111, "frac_reward_zero_std": 1.0, "grad_norm": 4.100195210143883e-19, "kl": 0.023101806640625, "learning_rate": 1.1598759599170184e-05, "loss": 0.0009, "num_tokens": 1462068061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5043953230349065, "frac_reward_zero_std": 1.0, "grad_norm": 4.686009518109568e-19, "kl": 0.023193359375, "learning_rate": 1.1592878066576112e-05, "loss": 0.0009, "num_tokens": 1462634509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5045660151916019, "frac_reward_zero_std": 1.0, "grad_norm": 4.1523358979698053e-19, "kl": 0.023193359375, "learning_rate": 1.1586995968567827e-05, "loss": 0.0009, "num_tokens": 1463198317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5047367073482973, "frac_reward_zero_std": 1.0, "grad_norm": 4.804962919447388e-19, "kl": 0.024444580078125, "learning_rate": 1.158111330723326e-05, "loss": 0.001, "num_tokens": 1463765437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5049073995049927, "frac_reward_zero_std": 1.0, "grad_norm": 3.954779250379019e-19, "kl": 0.022705078125, "learning_rate": 1.1575230084660544e-05, "loss": 0.0009, "num_tokens": 1464330509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5050780916616882, "frac_reward_zero_std": 1.0, "grad_norm": 4.484691000939406e-19, "kl": 0.0234375, "learning_rate": 1.1569346302938012e-05, "loss": 0.0009, "num_tokens": 1464896573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5052487838183836, "frac_reward_zero_std": 1.0, "grad_norm": 4.402481516999033e-19, "kl": 0.02362060546875, "learning_rate": 1.1563461964154192e-05, "loss": 0.0009, "num_tokens": 1465462973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.505419475975079, "frac_reward_zero_std": 1.0, "grad_norm": 4.013848858931964e-19, "kl": 0.02215576171875, "learning_rate": 1.1557577070397813e-05, "loss": 0.0009, "num_tokens": 1466027965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5055901681317744, "frac_reward_zero_std": 1.0, "grad_norm": 4.0991481362933563e-19, "kl": 0.022979736328125, "learning_rate": 1.1551691623757797e-05, "loss": 0.0009, "num_tokens": 1466591757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5057608602884698, "frac_reward_zero_std": 1.0, "grad_norm": 4.403333173314174e-19, "kl": 0.023162841796875, "learning_rate": 1.1545805626323265e-05, "loss": 0.0009, "num_tokens": 1467160173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5059315524451652, "frac_reward_zero_std": 1.0, "grad_norm": 4.0954225468041647e-19, "kl": 0.0233154296875, "learning_rate": 1.1539919080183544e-05, "loss": 0.0009, "num_tokens": 1467719693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5061022446018606, "frac_reward_zero_std": 1.0, "grad_norm": 4.190005507317845e-19, "kl": 0.02288818359375, "learning_rate": 1.153403198742813e-05, "loss": 0.0009, "num_tokens": 1468289629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5062729367585559, "frac_reward_zero_std": 1.0, "grad_norm": 4.1214924844390767e-19, "kl": 0.022796630859375, "learning_rate": 1.152814435014674e-05, "loss": 0.0009, "num_tokens": 1468856045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5064436289152513, "frac_reward_zero_std": 1.0, "grad_norm": 3.7804928137698937e-19, "kl": 0.022491455078125, "learning_rate": 1.1522256170429268e-05, "loss": 0.0009, "num_tokens": 1469419965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5066143210719467, "frac_reward_zero_std": 1.0, "grad_norm": 4.524849988700789e-19, "kl": 0.022674560546875, "learning_rate": 1.1516367450365804e-05, "loss": 0.0009, "num_tokens": 1469984861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5067850132286421, "frac_reward_zero_std": 1.0, "grad_norm": 4.049800978624298e-19, "kl": 0.0228271484375, "learning_rate": 1.1510478192046632e-05, "loss": 0.0009, "num_tokens": 1470548173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5069557053853375, "frac_reward_zero_std": 1.0, "grad_norm": 4.498670665911271e-19, "kl": 0.024444580078125, "learning_rate": 1.1504588397562233e-05, "loss": 0.001, "num_tokens": 1471118493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5071263975420329, "frac_reward_zero_std": 1.0, "grad_norm": 4.05692611804762e-19, "kl": 0.02313232421875, "learning_rate": 1.1498698069003258e-05, "loss": 0.0009, "num_tokens": 1471689453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5072970896987283, "frac_reward_zero_std": 1.0, "grad_norm": 4.1535465596114865e-19, "kl": 0.022796630859375, "learning_rate": 1.1492807208460575e-05, "loss": 0.0009, "num_tokens": 1472261037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5074677818554237, "frac_reward_zero_std": 1.0, "grad_norm": 3.9340522963644103e-19, "kl": 0.023040771484375, "learning_rate": 1.1486915818025214e-05, "loss": 0.0009, "num_tokens": 1472825341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5076384740121191, "frac_reward_zero_std": 1.0, "grad_norm": 4.559645936412434e-19, "kl": 0.023406982421875, "learning_rate": 1.1481023899788414e-05, "loss": 0.0009, "num_tokens": 1473389917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5078091661688146, "frac_reward_zero_std": 1.0, "grad_norm": 4.429393570058929e-19, "kl": 0.0238037109375, "learning_rate": 1.1475131455841595e-05, "loss": 0.001, "num_tokens": 1473956317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.50797985832551, "frac_reward_zero_std": 1.0, "grad_norm": 3.954066099592238e-19, "kl": 0.022796630859375, "learning_rate": 1.1469238488276355e-05, "loss": 0.0009, "num_tokens": 1474522765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5081505504822054, "frac_reward_zero_std": 1.0, "grad_norm": 4.418585188709574e-19, "kl": 0.02386474609375, "learning_rate": 1.1463344999184489e-05, "loss": 0.001, "num_tokens": 1475087613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5083212426389008, "frac_reward_zero_std": 1.0, "grad_norm": 5.030675738628581e-19, "kl": 0.02435302734375, "learning_rate": 1.1457450990657973e-05, "loss": 0.001, "num_tokens": 1475655389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5084919347955962, "frac_reward_zero_std": 1.0, "grad_norm": 4.398242930655076e-19, "kl": 0.023651123046875, "learning_rate": 1.1451556464788964e-05, "loss": 0.0009, "num_tokens": 1476219117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5086626269522916, "frac_reward_zero_std": 1.0, "grad_norm": 3.672816033111777e-19, "kl": 0.022125244140625, "learning_rate": 1.1445661423669812e-05, "loss": 0.0009, "num_tokens": 1476785757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.508833319108987, "frac_reward_zero_std": 1.0, "grad_norm": 4.582011732061947e-19, "kl": 0.0245361328125, "learning_rate": 1.1439765869393036e-05, "loss": 0.001, "num_tokens": 1477349325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5090040112656823, "frac_reward_zero_std": 1.0, "grad_norm": 3.8344804275426684e-19, "kl": 0.0224609375, "learning_rate": 1.1433869804051349e-05, "loss": 0.0009, "num_tokens": 1477912605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5091747034223777, "frac_reward_zero_std": 1.0, "grad_norm": 4.025073146264692e-19, "kl": 0.02264404296875, "learning_rate": 1.1427973229737645e-05, "loss": 0.0009, "num_tokens": 1478477821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5093453955790731, "frac_reward_zero_std": 1.0, "grad_norm": 4.354507889691706e-19, "kl": 0.022857666015625, "learning_rate": 1.142207614854499e-05, "loss": 0.0009, "num_tokens": 1479038269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5095160877357685, "frac_reward_zero_std": 1.0, "grad_norm": 4.001114555638614e-19, "kl": 0.02398681640625, "learning_rate": 1.1416178562566633e-05, "loss": 0.001, "num_tokens": 1479604701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5096867798924639, "frac_reward_zero_std": 1.0, "grad_norm": 4.037644751059466e-19, "kl": 0.02215576171875, "learning_rate": 1.1410280473896011e-05, "loss": 0.0009, "num_tokens": 1480167037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5098574720491593, "frac_reward_zero_std": 1.0, "grad_norm": 4.439724774112186e-19, "kl": 0.023223876953125, "learning_rate": 1.1404381884626728e-05, "loss": 0.0009, "num_tokens": 1480727965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5100281642058547, "frac_reward_zero_std": 1.0, "grad_norm": 4.3096636672672593e-19, "kl": 0.022857666015625, "learning_rate": 1.1398482796852571e-05, "loss": 0.0009, "num_tokens": 1481287917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5101988563625501, "frac_reward_zero_std": 1.0, "grad_norm": 4.221359462154391e-19, "kl": 0.02294921875, "learning_rate": 1.1392583212667503e-05, "loss": 0.0009, "num_tokens": 1481857053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5103695485192455, "frac_reward_zero_std": 1.0, "grad_norm": 4.683725366204152e-19, "kl": 0.023651123046875, "learning_rate": 1.1386683134165666e-05, "loss": 0.0009, "num_tokens": 1482423469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.510540240675941, "frac_reward_zero_std": 1.0, "grad_norm": 4.35434298071817e-19, "kl": 0.0228271484375, "learning_rate": 1.138078256344137e-05, "loss": 0.0009, "num_tokens": 1482987261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5107109328326364, "frac_reward_zero_std": 1.0, "grad_norm": 4.584198547312645e-19, "kl": 0.02325439453125, "learning_rate": 1.1374881502589108e-05, "loss": 0.0009, "num_tokens": 1483545837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5108816249893318, "frac_reward_zero_std": 1.0, "grad_norm": 4.047278142966089e-19, "kl": 0.022796630859375, "learning_rate": 1.136897995370354e-05, "loss": 0.0009, "num_tokens": 1484108461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5110523171460272, "frac_reward_zero_std": 1.0, "grad_norm": 4.594805571138328e-19, "kl": 0.023040771484375, "learning_rate": 1.1363077918879512e-05, "loss": 0.0009, "num_tokens": 1484676749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5112230093027226, "frac_reward_zero_std": 1.0, "grad_norm": 4.014581252739832e-19, "kl": 0.02313232421875, "learning_rate": 1.1357175400212024e-05, "loss": 0.0009, "num_tokens": 1485244525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.511393701459418, "frac_reward_zero_std": 1.0, "grad_norm": 4.509512232707043e-19, "kl": 0.023773193359375, "learning_rate": 1.1351272399796262e-05, "loss": 0.001, "num_tokens": 1485815645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5115643936161134, "frac_reward_zero_std": 1.0, "grad_norm": 4.079544036366462e-19, "kl": 0.022735595703125, "learning_rate": 1.1345368919727574e-05, "loss": 0.0009, "num_tokens": 1486383325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5117350857728087, "frac_reward_zero_std": 1.0, "grad_norm": 4.3105243909887837e-19, "kl": 0.0224609375, "learning_rate": 1.1339464962101486e-05, "loss": 0.0009, "num_tokens": 1486947677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5119057779295041, "frac_reward_zero_std": 1.0, "grad_norm": 4.240453981741516e-19, "kl": 0.023345947265625, "learning_rate": 1.1333560529013685e-05, "loss": 0.0009, "num_tokens": 1487511389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5120764700861995, "frac_reward_zero_std": 1.0, "grad_norm": 4.2716090775108125e-19, "kl": 0.02276611328125, "learning_rate": 1.1327655622560038e-05, "loss": 0.0009, "num_tokens": 1488077949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5122471622428949, "frac_reward_zero_std": 1.0, "grad_norm": 4.540406045919554e-19, "kl": 0.024078369140625, "learning_rate": 1.132175024483657e-05, "loss": 0.001, "num_tokens": 1488647085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5124178543995903, "frac_reward_zero_std": 1.0, "grad_norm": 4.1276794421372798e-19, "kl": 0.022735595703125, "learning_rate": 1.131584439793948e-05, "loss": 0.0009, "num_tokens": 1489205997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5125885465562857, "frac_reward_zero_std": 1.0, "grad_norm": 4.1684689756385184e-19, "kl": 0.023681640625, "learning_rate": 1.1309938083965121e-05, "loss": 0.0009, "num_tokens": 1489770861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5127592387129811, "frac_reward_zero_std": 1.0, "grad_norm": 4.182091402536984e-19, "kl": 0.02264404296875, "learning_rate": 1.1304031305010037e-05, "loss": 0.0009, "num_tokens": 1490334509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5129299308696765, "frac_reward_zero_std": 1.0, "grad_norm": 3.9633918786425584e-19, "kl": 0.02325439453125, "learning_rate": 1.1298124063170906e-05, "loss": 0.0009, "num_tokens": 1490899245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5131006230263719, "frac_reward_zero_std": 1.0, "grad_norm": 3.7804608747964416e-19, "kl": 0.021881103515625, "learning_rate": 1.1292216360544594e-05, "loss": 0.0009, "num_tokens": 1491462909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5132713151830673, "frac_reward_zero_std": 1.0, "grad_norm": 4.2099389875636846e-19, "kl": 0.023712158203125, "learning_rate": 1.1286308199228122e-05, "loss": 0.0009, "num_tokens": 1492027949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5134420073397628, "frac_reward_zero_std": 1.0, "grad_norm": 4.306168752082369e-19, "kl": 0.023193359375, "learning_rate": 1.1280399581318671e-05, "loss": 0.0009, "num_tokens": 1492592621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5136126994964582, "frac_reward_zero_std": 1.0, "grad_norm": 4.158555742360667e-19, "kl": 0.0224609375, "learning_rate": 1.1274490508913588e-05, "loss": 0.0009, "num_tokens": 1493159789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5137833916531536, "frac_reward_zero_std": 1.0, "grad_norm": 4.0405645949666803e-19, "kl": 0.024139404296875, "learning_rate": 1.1268580984110382e-05, "loss": 0.001, "num_tokens": 1493727245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.513954083809849, "frac_reward_zero_std": 1.0, "grad_norm": 4.2648507319236763e-19, "kl": 0.023101806640625, "learning_rate": 1.1262671009006719e-05, "loss": 0.0009, "num_tokens": 1494292493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5141247759665444, "frac_reward_zero_std": 1.0, "grad_norm": 4.1091881617520804e-19, "kl": 0.02288818359375, "learning_rate": 1.1256760585700428e-05, "loss": 0.0009, "num_tokens": 1494856621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5142954681232398, "frac_reward_zero_std": 1.0, "grad_norm": 4.364543830756736e-19, "kl": 0.0234375, "learning_rate": 1.1250849716289497e-05, "loss": 0.0009, "num_tokens": 1495418109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5144661602799352, "frac_reward_zero_std": 1.0, "grad_norm": 3.826969036829021e-19, "kl": 0.021728515625, "learning_rate": 1.1244938402872065e-05, "loss": 0.0009, "num_tokens": 1495979549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5146368524366305, "frac_reward_zero_std": 1.0, "grad_norm": 4.330482218316785e-19, "kl": 0.023895263671875, "learning_rate": 1.1239026647546442e-05, "loss": 0.001, "num_tokens": 1496543709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5148075445933259, "frac_reward_zero_std": 1.0, "grad_norm": 4.177868943003962e-19, "kl": 0.023284912109375, "learning_rate": 1.1233114452411088e-05, "loss": 0.0009, "num_tokens": 1497112845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5149782367500213, "frac_reward_zero_std": 1.0, "grad_norm": 4.0022416240830294e-19, "kl": 0.022979736328125, "learning_rate": 1.1227201819564615e-05, "loss": 0.0009, "num_tokens": 1497678781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5151489289067167, "frac_reward_zero_std": 1.0, "grad_norm": 4.643023599655151e-19, "kl": 0.02288818359375, "learning_rate": 1.1221288751105794e-05, "loss": 0.0009, "num_tokens": 1498240061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5153196210634121, "frac_reward_zero_std": 1.0, "grad_norm": 3.813438927192386e-19, "kl": 0.021728515625, "learning_rate": 1.1215375249133553e-05, "loss": 0.0009, "num_tokens": 1498803069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5154903132201075, "frac_reward_zero_std": 1.0, "grad_norm": 4.164724594078624e-19, "kl": 0.022796630859375, "learning_rate": 1.1209461315746972e-05, "loss": 0.0009, "num_tokens": 1499365645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5156610053768029, "frac_reward_zero_std": 1.0, "grad_norm": 4.379819171821819e-19, "kl": 0.022796630859375, "learning_rate": 1.1203546953045283e-05, "loss": 0.0009, "num_tokens": 1499923597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5158316975334983, "frac_reward_zero_std": 1.0, "grad_norm": 4.86570446501506e-19, "kl": 0.02435302734375, "learning_rate": 1.119763216312787e-05, "loss": 0.001, "num_tokens": 1500497997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5160023896901937, "frac_reward_zero_std": 1.0, "grad_norm": 4.453019450309843e-19, "kl": 0.02459716796875, "learning_rate": 1.119171694809427e-05, "loss": 0.001, "num_tokens": 1501060989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5161730818468891, "frac_reward_zero_std": 1.0, "grad_norm": 4.3059403841020455e-19, "kl": 0.023406982421875, "learning_rate": 1.1185801310044176e-05, "loss": 0.0009, "num_tokens": 1501626509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5163437740035846, "frac_reward_zero_std": 1.0, "grad_norm": 4.709078541839492e-19, "kl": 0.023773193359375, "learning_rate": 1.1179885251077418e-05, "loss": 0.001, "num_tokens": 1502196093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.51651446616028, "frac_reward_zero_std": 1.0, "grad_norm": 3.9178927743058485e-19, "kl": 0.023712158203125, "learning_rate": 1.117396877329399e-05, "loss": 0.0009, "num_tokens": 1502761645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5166851583169754, "frac_reward_zero_std": 1.0, "grad_norm": 4.12401392916799e-19, "kl": 0.02288818359375, "learning_rate": 1.1168051878794021e-05, "loss": 0.0009, "num_tokens": 1503326509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5168558504736708, "frac_reward_zero_std": 1.0, "grad_norm": 4.492663943716527e-19, "kl": 0.023681640625, "learning_rate": 1.1162134569677805e-05, "loss": 0.0009, "num_tokens": 1503887517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5170265426303662, "frac_reward_zero_std": 1.0, "grad_norm": 4.558759406769469e-19, "kl": 0.02276611328125, "learning_rate": 1.1156216848045764e-05, "loss": 0.0009, "num_tokens": 1504448893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5171972347870616, "frac_reward_zero_std": 1.0, "grad_norm": 4.422755875405851e-19, "kl": 0.0224609375, "learning_rate": 1.1150298715998482e-05, "loss": 0.0009, "num_tokens": 1505016317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5173679269437569, "frac_reward_zero_std": 1.0, "grad_norm": 4.379296546437326e-19, "kl": 0.023651123046875, "learning_rate": 1.1144380175636672e-05, "loss": 0.0009, "num_tokens": 1505584205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5175386191004523, "frac_reward_zero_std": 1.0, "grad_norm": 4.579808321164901e-19, "kl": 0.023406982421875, "learning_rate": 1.113846122906122e-05, "loss": 0.0009, "num_tokens": 1506150829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5177093112571477, "frac_reward_zero_std": 1.0, "grad_norm": 4.630982746704112e-19, "kl": 0.023529052734375, "learning_rate": 1.113254187837312e-05, "loss": 0.0009, "num_tokens": 1506713741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5178800034138431, "frac_reward_zero_std": 1.0, "grad_norm": 4.111111700394335e-19, "kl": 0.0230712890625, "learning_rate": 1.1126622125673538e-05, "loss": 0.0009, "num_tokens": 1507281693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5180506955705385, "frac_reward_zero_std": 1.0, "grad_norm": 4.2589055775959916e-19, "kl": 0.02325439453125, "learning_rate": 1.1120701973063768e-05, "loss": 0.0009, "num_tokens": 1507846909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5182213877272339, "frac_reward_zero_std": 1.0, "grad_norm": 4.354469978293135e-19, "kl": 0.023651123046875, "learning_rate": 1.1114781422645255e-05, "loss": 0.0009, "num_tokens": 1508413821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5183920798839293, "frac_reward_zero_std": 1.0, "grad_norm": 4.2724802195567426e-19, "kl": 0.024658203125, "learning_rate": 1.110886047651958e-05, "loss": 0.001, "num_tokens": 1508977693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5185627720406247, "frac_reward_zero_std": 1.0, "grad_norm": 4.347719530159332e-19, "kl": 0.023590087890625, "learning_rate": 1.110293913678846e-05, "loss": 0.0009, "num_tokens": 1509550637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5187334641973201, "frac_reward_zero_std": 1.0, "grad_norm": 4.535860895741967e-19, "kl": 0.025115966796875, "learning_rate": 1.109701740555376e-05, "loss": 0.001, "num_tokens": 1510117405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5189041563540155, "frac_reward_zero_std": 1.0, "grad_norm": 4.209188477496425e-19, "kl": 0.022735595703125, "learning_rate": 1.1091095284917483e-05, "loss": 0.0009, "num_tokens": 1510682669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.519074848510711, "frac_reward_zero_std": 1.0, "grad_norm": 4.798691725807496e-19, "kl": 0.02392578125, "learning_rate": 1.1085172776981766e-05, "loss": 0.001, "num_tokens": 1511246589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5192455406674064, "frac_reward_zero_std": 1.0, "grad_norm": 4.3005269482516203e-19, "kl": 0.02313232421875, "learning_rate": 1.1079249883848887e-05, "loss": 0.0009, "num_tokens": 1511813853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5194162328241018, "frac_reward_zero_std": 1.0, "grad_norm": 4.752182273853381e-19, "kl": 0.023529052734375, "learning_rate": 1.1073326607621255e-05, "loss": 0.0009, "num_tokens": 1512376941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5195869249807972, "frac_reward_zero_std": 1.0, "grad_norm": 4.2240925390090116e-19, "kl": 0.0223388671875, "learning_rate": 1.1067402950401422e-05, "loss": 0.0009, "num_tokens": 1512939677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5197576171374926, "frac_reward_zero_std": 1.0, "grad_norm": 4.3025085620798753e-19, "kl": 0.023162841796875, "learning_rate": 1.1061478914292075e-05, "loss": 0.0009, "num_tokens": 1513510893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.519928309294188, "frac_reward_zero_std": 1.0, "grad_norm": 4.3014567584186516e-19, "kl": 0.024444580078125, "learning_rate": 1.1055554501396029e-05, "loss": 0.001, "num_tokens": 1514072669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5200990014508833, "frac_reward_zero_std": 1.0, "grad_norm": 4.979318492766103e-19, "kl": 0.02435302734375, "learning_rate": 1.104962971381624e-05, "loss": 0.001, "num_tokens": 1514639245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5202696936075787, "frac_reward_zero_std": 1.0, "grad_norm": 4.3140726544996067e-19, "kl": 0.022796630859375, "learning_rate": 1.104370455365579e-05, "loss": 0.0009, "num_tokens": 1515196861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5204403857642741, "frac_reward_zero_std": 1.0, "grad_norm": 4.521757597524601e-19, "kl": 0.023162841796875, "learning_rate": 1.10377790230179e-05, "loss": 0.0009, "num_tokens": 1515763773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5206110779209695, "frac_reward_zero_std": 1.0, "grad_norm": 4.2829596679600883e-19, "kl": 0.023406982421875, "learning_rate": 1.103185312400592e-05, "loss": 0.0009, "num_tokens": 1516329805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5207817700776649, "frac_reward_zero_std": 1.0, "grad_norm": 4.488288837834085e-19, "kl": 0.02392578125, "learning_rate": 1.1025926858723327e-05, "loss": 0.001, "num_tokens": 1516893037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5209524622343603, "frac_reward_zero_std": 1.0, "grad_norm": 4.339706341893213e-19, "kl": 0.02386474609375, "learning_rate": 1.1020000229273732e-05, "loss": 0.001, "num_tokens": 1517468365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5211231543910557, "frac_reward_zero_std": 1.0, "grad_norm": 4.491266569489665e-19, "kl": 0.02349853515625, "learning_rate": 1.1014073237760879e-05, "loss": 0.0009, "num_tokens": 1518032189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5212938465477511, "frac_reward_zero_std": 1.0, "grad_norm": 4.538157145705946e-19, "kl": 0.023529052734375, "learning_rate": 1.1008145886288629e-05, "loss": 0.0009, "num_tokens": 1518597197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5214645387044465, "frac_reward_zero_std": 1.0, "grad_norm": 3.8603649305386056e-19, "kl": 0.023101806640625, "learning_rate": 1.1002218176960982e-05, "loss": 0.0009, "num_tokens": 1519161581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5216352308611419, "frac_reward_zero_std": 1.0, "grad_norm": 4.488550923942406e-19, "kl": 0.023223876953125, "learning_rate": 1.0996290111882062e-05, "loss": 0.0009, "num_tokens": 1519723997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5218059230178373, "frac_reward_zero_std": 1.0, "grad_norm": 4.153223259539378e-19, "kl": 0.022979736328125, "learning_rate": 1.0990361693156115e-05, "loss": 0.0009, "num_tokens": 1520288701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5219766151745328, "frac_reward_zero_std": 1.0, "grad_norm": 4.553804708819898e-19, "kl": 0.02490234375, "learning_rate": 1.0984432922887516e-05, "loss": 0.001, "num_tokens": 1520860573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5221473073312282, "frac_reward_zero_std": 1.0, "grad_norm": 4.671533796573888e-19, "kl": 0.0245361328125, "learning_rate": 1.0978503803180767e-05, "loss": 0.001, "num_tokens": 1521433613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5223179994879236, "frac_reward_zero_std": 1.0, "grad_norm": 5.151321957659965e-19, "kl": 0.024993896484375, "learning_rate": 1.097257433614049e-05, "loss": 0.001, "num_tokens": 1521995581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.522488691644619, "frac_reward_zero_std": 1.0, "grad_norm": 4.471549634171654e-19, "kl": 0.023284912109375, "learning_rate": 1.0966644523871431e-05, "loss": 0.0009, "num_tokens": 1522556093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5226593838013144, "frac_reward_zero_std": 1.0, "grad_norm": 4.479173433703974e-19, "kl": 0.023956298828125, "learning_rate": 1.0960714368478458e-05, "loss": 0.001, "num_tokens": 1523123069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5228300759580097, "frac_reward_zero_std": 1.0, "grad_norm": 4.348210285081088e-19, "kl": 0.024383544921875, "learning_rate": 1.0954783872066566e-05, "loss": 0.001, "num_tokens": 1523685069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5230007681147051, "frac_reward_zero_std": 1.0, "grad_norm": 4.2385550399635594e-19, "kl": 0.022674560546875, "learning_rate": 1.0948853036740865e-05, "loss": 0.0009, "num_tokens": 1524254829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5231714602714005, "frac_reward_zero_std": 1.0, "grad_norm": 4.3014468221097577e-19, "kl": 0.023681640625, "learning_rate": 1.0942921864606588e-05, "loss": 0.0009, "num_tokens": 1524821485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5233421524280959, "frac_reward_zero_std": 1.0, "grad_norm": 3.9799782826479415e-19, "kl": 0.022613525390625, "learning_rate": 1.0936990357769083e-05, "loss": 0.0009, "num_tokens": 1525383597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5235128445847913, "frac_reward_zero_std": 1.0, "grad_norm": 4.619589597122094e-19, "kl": 0.02410888671875, "learning_rate": 1.0931058518333831e-05, "loss": 0.001, "num_tokens": 1525956877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5236835367414867, "frac_reward_zero_std": 1.0, "grad_norm": 4.2690415469880196e-19, "kl": 0.022796630859375, "learning_rate": 1.0925126348406407e-05, "loss": 0.0009, "num_tokens": 1526522989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5238542288981821, "frac_reward_zero_std": 1.0, "grad_norm": 4.0374579099452476e-19, "kl": 0.02264404296875, "learning_rate": 1.0919193850092531e-05, "loss": 0.0009, "num_tokens": 1527103037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5240249210548775, "frac_reward_zero_std": 1.0, "grad_norm": 4.2171512092562656e-19, "kl": 0.02349853515625, "learning_rate": 1.0913261025498014e-05, "loss": 0.0009, "num_tokens": 1527670717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5241956132115729, "frac_reward_zero_std": 1.0, "grad_norm": 4.337755957869359e-19, "kl": 0.02337646484375, "learning_rate": 1.0907327876728805e-05, "loss": 0.0009, "num_tokens": 1528234829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5243663053682683, "frac_reward_zero_std": 1.0, "grad_norm": 4.271942188330541e-19, "kl": 0.02386474609375, "learning_rate": 1.0901394405890947e-05, "loss": 0.001, "num_tokens": 1528803501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5245369975249637, "frac_reward_zero_std": 1.0, "grad_norm": 4.710823006491065e-19, "kl": 0.023345947265625, "learning_rate": 1.0895460615090618e-05, "loss": 0.0009, "num_tokens": 1529369277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5247076896816592, "frac_reward_zero_std": 1.0, "grad_norm": 4.861547422315059e-19, "kl": 0.024444580078125, "learning_rate": 1.0889526506434092e-05, "loss": 0.001, "num_tokens": 1529937149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5248783818383546, "frac_reward_zero_std": 1.0, "grad_norm": 4.40168433240581e-19, "kl": 0.023284912109375, "learning_rate": 1.0883592082027767e-05, "loss": 0.0009, "num_tokens": 1530500093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.52504907399505, "frac_reward_zero_std": 1.0, "grad_norm": 4.098632385370174e-19, "kl": 0.023773193359375, "learning_rate": 1.0877657343978147e-05, "loss": 0.0009, "num_tokens": 1531066861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5252197661517454, "frac_reward_zero_std": 1.0, "grad_norm": 4.459981632508534e-19, "kl": 0.023101806640625, "learning_rate": 1.0871722294391854e-05, "loss": 0.0009, "num_tokens": 1531630061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5253904583084408, "frac_reward_zero_std": 1.0, "grad_norm": 4.080008146108458e-19, "kl": 0.022247314453125, "learning_rate": 1.0865786935375612e-05, "loss": 0.0009, "num_tokens": 1532197181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5255611504651361, "frac_reward_zero_std": 1.0, "grad_norm": 4.683714638951942e-19, "kl": 0.024261474609375, "learning_rate": 1.0859851269036263e-05, "loss": 0.001, "num_tokens": 1532766493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5257318426218315, "frac_reward_zero_std": 1.0, "grad_norm": 3.9629047043508975e-19, "kl": 0.0223388671875, "learning_rate": 1.0853915297480753e-05, "loss": 0.0009, "num_tokens": 1533329037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5259025347785269, "frac_reward_zero_std": 1.0, "grad_norm": 4.1251726345208677e-19, "kl": 0.023162841796875, "learning_rate": 1.0847979022816135e-05, "loss": 0.0009, "num_tokens": 1533891789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5260732269352223, "frac_reward_zero_std": 1.0, "grad_norm": 3.936024427043874e-19, "kl": 0.022308349609375, "learning_rate": 1.0842042447149574e-05, "loss": 0.0009, "num_tokens": 1534456189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5262439190919177, "frac_reward_zero_std": 1.0, "grad_norm": 4.1621252072032705e-19, "kl": 0.022979736328125, "learning_rate": 1.0836105572588343e-05, "loss": 0.0009, "num_tokens": 1535017917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5264146112486131, "frac_reward_zero_std": 1.0, "grad_norm": 4.066851207072463e-19, "kl": 0.023040771484375, "learning_rate": 1.0830168401239811e-05, "loss": 0.0009, "num_tokens": 1535584509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5265853034053085, "frac_reward_zero_std": 1.0, "grad_norm": 4.40594999997509e-19, "kl": 0.023773193359375, "learning_rate": 1.0824230935211468e-05, "loss": 0.001, "num_tokens": 1536153597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5267559955620039, "frac_reward_zero_std": 1.0, "grad_norm": 375803119890.09204, "kl": 22817013760.0, "learning_rate": 1.0818293176610895e-05, "loss": 914358272.0, "num_tokens": 1536750605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5269266877186993, "frac_reward_zero_std": 1.0, "grad_norm": 1.7759490192437985e-15, "kl": 0.023468017578125, "learning_rate": 1.0812355127545781e-05, "loss": 0.0009, "num_tokens": 1537318653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5270973798753947, "frac_reward_zero_std": 1.0, "grad_norm": 3.508494037856558e-13, "kl": 0.0225830078125, "learning_rate": 1.0806416790123921e-05, "loss": 0.0009, "num_tokens": 1537883213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5272680720320901, "frac_reward_zero_std": 1.0, "grad_norm": 1.0637760115498147e-11, "kl": 0.0235595703125, "learning_rate": 1.0800478166453215e-05, "loss": 0.0009, "num_tokens": 1538454701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5274387641887855, "frac_reward_zero_std": 1.0, "grad_norm": 1.0832849959717709e-10, "kl": 0.024200439453125, "learning_rate": 1.0794539258641649e-05, "loss": 0.001, "num_tokens": 1539022157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.527609456345481, "frac_reward_zero_std": 1.0, "grad_norm": 2.1996073814369475e-09, "kl": 0.02398681640625, "learning_rate": 1.0788600068797333e-05, "loss": 0.001, "num_tokens": 1539588461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5277801485021764, "frac_reward_zero_std": 1.0, "grad_norm": 1.2469981694076412e-08, "kl": 0.0230712890625, "learning_rate": 1.0782660599028456e-05, "loss": 0.0009, "num_tokens": 1540152653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5279508406588718, "frac_reward_zero_std": 1.0, "grad_norm": 1.6879652715999246e-05, "kl": 0.0230712890625, "learning_rate": 1.077672085144332e-05, "loss": 0.0009, "num_tokens": 1540723149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5281215328155672, "frac_reward_zero_std": 1.0, "grad_norm": 2.9102156719154717e-05, "kl": 0.0238037109375, "learning_rate": 1.077078082815032e-05, "loss": 0.001, "num_tokens": 1541287261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5282922249722625, "frac_reward_zero_std": 1.0, "grad_norm": 6.364647065926698e-05, "kl": 0.0233154296875, "learning_rate": 1.076484053125795e-05, "loss": 0.0009, "num_tokens": 1541859917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5284629171289579, "frac_reward_zero_std": 1.0, "grad_norm": 0.00018585963431434844, "kl": 0.02374267578125, "learning_rate": 1.0758899962874798e-05, "loss": 0.001, "num_tokens": 1542424013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5286336092856533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002761590689164263, "kl": 0.024993896484375, "learning_rate": 1.0752959125109556e-05, "loss": 0.001, "num_tokens": 1542989437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5288043014423487, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005071904432368894, "kl": 0.025360107421875, "learning_rate": 1.0747018020071005e-05, "loss": 0.001, "num_tokens": 1543555741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5289749935990441, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008391755638907761, "kl": 0.02587890625, "learning_rate": 1.0741076649868023e-05, "loss": 0.001, "num_tokens": 1544124669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5291456857557395, "frac_reward_zero_std": 1.0, "grad_norm": 0.008683336025132984, "kl": 0.0306396484375, "learning_rate": 1.073513501660958e-05, "loss": 0.0012, "num_tokens": 1544694237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5293163779124349, "frac_reward_zero_std": 1.0, "grad_norm": 0.001181005978794406, "kl": 0.02655029296875, "learning_rate": 1.0729193122404746e-05, "loss": 0.0011, "num_tokens": 1545287773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5294870700691303, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017985649605761183, "kl": 0.028045654296875, "learning_rate": 1.0723250969362671e-05, "loss": 0.0011, "num_tokens": 1545855133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5296577622258257, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024895472123011553, "kl": 0.03021240234375, "learning_rate": 1.0717308559592615e-05, "loss": 0.0012, "num_tokens": 1546418061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5298284543825211, "frac_reward_zero_std": 1.0, "grad_norm": 0.003851394116126748, "kl": 0.032470703125, "learning_rate": 1.0711365895203909e-05, "loss": 0.0013, "num_tokens": 1546979117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5299991465392165, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021910212396880967, "kl": 0.0333251953125, "learning_rate": 1.0705422978305993e-05, "loss": 0.0013, "num_tokens": 1547550237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5301698386959119, "frac_reward_zero_std": 1.0, "grad_norm": 0.008285752698400652, "kl": 0.03778076171875, "learning_rate": 1.069947981100838e-05, "loss": 0.0015, "num_tokens": 1548117085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5303405308526074, "frac_reward_zero_std": 1.0, "grad_norm": 0.014910398560259573, "kl": 0.0458984375, "learning_rate": 1.069353639542069e-05, "loss": 0.0018, "num_tokens": 1548684829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5305112230093028, "frac_reward_zero_std": 1.0, "grad_norm": 0.010067925988195, "kl": 0.0374755859375, "learning_rate": 1.0687592733652607e-05, "loss": 0.0015, "num_tokens": 1549252397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5306819151659982, "frac_reward_zero_std": 1.0, "grad_norm": 0.004150188731724676, "kl": 0.0386962890625, "learning_rate": 1.0681648827813928e-05, "loss": 0.0015, "num_tokens": 1549813853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5308526073226936, "frac_reward_zero_std": 1.0, "grad_norm": 0.004855810571814982, "kl": 0.03643798828125, "learning_rate": 1.0675704680014523e-05, "loss": 0.0015, "num_tokens": 1550383037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.531023299479389, "frac_reward_zero_std": 1.0, "grad_norm": 0.00412848890531056, "kl": 0.03692626953125, "learning_rate": 1.0669760292364346e-05, "loss": 0.0015, "num_tokens": 1550945693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5311939916360843, "frac_reward_zero_std": 1.0, "grad_norm": 0.009244502202177502, "kl": 0.041748046875, "learning_rate": 1.0663815666973443e-05, "loss": 0.0017, "num_tokens": 1551510381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5313646837927797, "frac_reward_zero_std": 1.0, "grad_norm": 0.0059468599053623, "kl": 0.03778076171875, "learning_rate": 1.065787080595194e-05, "loss": 0.0015, "num_tokens": 1552077757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5315353759494751, "frac_reward_zero_std": 1.0, "grad_norm": 0.005237459425038348, "kl": 0.03704833984375, "learning_rate": 1.0651925711410049e-05, "loss": 0.0015, "num_tokens": 1552645085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5317060681061705, "frac_reward_zero_std": 1.0, "grad_norm": 0.010502907795016218, "kl": 0.0447998046875, "learning_rate": 1.0645980385458063e-05, "loss": 0.0018, "num_tokens": 1553213341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5318767602628659, "frac_reward_zero_std": 1.0, "grad_norm": 0.007046246307088773, "kl": 0.04217529296875, "learning_rate": 1.0640034830206361e-05, "loss": 0.0017, "num_tokens": 1553779757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5320474524195613, "frac_reward_zero_std": 1.0, "grad_norm": 0.004979574325289659, "kl": 0.04071044921875, "learning_rate": 1.0634089047765394e-05, "loss": 0.0016, "num_tokens": 1554341613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5322181445762567, "frac_reward_zero_std": 1.0, "grad_norm": 0.004541598332876309, "kl": 0.0361328125, "learning_rate": 1.0628143040245708e-05, "loss": 0.0014, "num_tokens": 1554903837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5323888367329521, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034215213567667796, "kl": 0.03924560546875, "learning_rate": 1.0622196809757916e-05, "loss": 0.0016, "num_tokens": 1555472845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5325595288896475, "frac_reward_zero_std": 1.0, "grad_norm": 0.027893193171370498, "kl": 0.0450439453125, "learning_rate": 1.0616250358412717e-05, "loss": 0.0018, "num_tokens": 1556040845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5327302210463429, "frac_reward_zero_std": 1.0, "grad_norm": 0.011575189646087463, "kl": 0.0406494140625, "learning_rate": 1.0610303688320886e-05, "loss": 0.0016, "num_tokens": 1556609293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5329009132030383, "frac_reward_zero_std": 1.0, "grad_norm": 0.007740165319880107, "kl": 0.040283203125, "learning_rate": 1.0604356801593276e-05, "loss": 0.0016, "num_tokens": 1557183965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5330716053597337, "frac_reward_zero_std": 1.0, "grad_norm": 0.006585493818971356, "kl": 0.03753662109375, "learning_rate": 1.059840970034082e-05, "loss": 0.0015, "num_tokens": 1557748285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5332422975164292, "frac_reward_zero_std": 1.0, "grad_norm": 0.0057578947702932775, "kl": 0.0369873046875, "learning_rate": 1.0592462386674518e-05, "loss": 0.0015, "num_tokens": 1558310957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5334129896731246, "frac_reward_zero_std": 1.0, "grad_norm": 0.0060948986931926965, "kl": 0.0391845703125, "learning_rate": 1.0586514862705457e-05, "loss": 0.0016, "num_tokens": 1558877293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.53358368182982, "frac_reward_zero_std": 1.0, "grad_norm": 0.05022405308010261, "kl": 0.0445556640625, "learning_rate": 1.0580567130544792e-05, "loss": 0.0018, "num_tokens": 1559440605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5337543739865154, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038557033789715606, "kl": 0.0411376953125, "learning_rate": 1.0574619192303752e-05, "loss": 0.0016, "num_tokens": 1560008541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5339250661432107, "frac_reward_zero_std": 1.0, "grad_norm": 0.005822499437556699, "kl": 0.03814697265625, "learning_rate": 1.056867105009364e-05, "loss": 0.0015, "num_tokens": 1560578589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5340957582999061, "frac_reward_zero_std": 1.0, "grad_norm": 0.012175548390478812, "kl": 0.04388427734375, "learning_rate": 1.0562722706025836e-05, "loss": 0.0018, "num_tokens": 1561149709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5342664504566015, "frac_reward_zero_std": 1.0, "grad_norm": 0.00536568069347352, "kl": 0.040771484375, "learning_rate": 1.0556774162211782e-05, "loss": 0.0016, "num_tokens": 1561713453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5344371426132969, "frac_reward_zero_std": 1.0, "grad_norm": 0.005377427378306173, "kl": 0.03875732421875, "learning_rate": 1.0550825420763002e-05, "loss": 0.0015, "num_tokens": 1562279485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5346078347699923, "frac_reward_zero_std": 1.0, "grad_norm": 0.005376917179913529, "kl": 0.0380859375, "learning_rate": 1.0544876483791078e-05, "loss": 0.0015, "num_tokens": 1562844461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5347785269266877, "frac_reward_zero_std": 1.0, "grad_norm": 0.0055042369626935135, "kl": 0.038330078125, "learning_rate": 1.0538927353407672e-05, "loss": 0.0015, "num_tokens": 1563419165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5349492190833831, "frac_reward_zero_std": 1.0, "grad_norm": 0.08066193187172778, "kl": 0.047119140625, "learning_rate": 1.053297803172451e-05, "loss": 0.0019, "num_tokens": 1563982573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5351199112400785, "frac_reward_zero_std": 1.0, "grad_norm": 0.004655235493917693, "kl": 0.038330078125, "learning_rate": 1.0527028520853388e-05, "loss": 0.0015, "num_tokens": 1564541085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5352906033967739, "frac_reward_zero_std": 1.0, "grad_norm": 0.005377479535984946, "kl": 0.03900146484375, "learning_rate": 1.0521078822906164e-05, "loss": 0.0016, "num_tokens": 1565111773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5354612955534693, "frac_reward_zero_std": 1.0, "grad_norm": 0.022437753369448792, "kl": 0.04217529296875, "learning_rate": 1.051512893999477e-05, "loss": 0.0017, "num_tokens": 1565672733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5356319877101647, "frac_reward_zero_std": 1.0, "grad_norm": 0.006824091475382076, "kl": 0.0416259765625, "learning_rate": 1.0509178874231191e-05, "loss": 0.0017, "num_tokens": 1566235229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5358026798668601, "frac_reward_zero_std": 1.0, "grad_norm": 0.006015957457018175, "kl": 0.040283203125, "learning_rate": 1.0503228627727497e-05, "loss": 0.0016, "num_tokens": 1566798749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5359733720235555, "frac_reward_zero_std": 1.0, "grad_norm": 0.009459204198997028, "kl": 0.04278564453125, "learning_rate": 1.0497278202595804e-05, "loss": 0.0017, "num_tokens": 1567366189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.536144064180251, "frac_reward_zero_std": 1.0, "grad_norm": 0.004965455654110669, "kl": 0.04376220703125, "learning_rate": 1.0491327600948302e-05, "loss": 0.0018, "num_tokens": 1567937517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5363147563369464, "frac_reward_zero_std": 1.0, "grad_norm": 0.04433297651043302, "kl": 0.04974365234375, "learning_rate": 1.0485376824897236e-05, "loss": 0.002, "num_tokens": 1568499325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5364854484936418, "frac_reward_zero_std": 1.0, "grad_norm": 0.01018840318959021, "kl": 0.04486083984375, "learning_rate": 1.0479425876554919e-05, "loss": 0.0018, "num_tokens": 1569066717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5366561406503371, "frac_reward_zero_std": 1.0, "grad_norm": 0.006401773753765077, "kl": 0.0438232421875, "learning_rate": 1.0473474758033721e-05, "loss": 0.0018, "num_tokens": 1569629725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5368268328070325, "frac_reward_zero_std": 1.0, "grad_norm": 0.010034728902130455, "kl": 0.04803466796875, "learning_rate": 1.0467523471446077e-05, "loss": 0.0019, "num_tokens": 1570191597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5369975249637279, "frac_reward_zero_std": 1.0, "grad_norm": 0.02115977010019201, "kl": 0.04473876953125, "learning_rate": 1.0461572018904476e-05, "loss": 0.0018, "num_tokens": 1570753869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5371682171204233, "frac_reward_zero_std": 1.0, "grad_norm": 0.016287432785788463, "kl": 0.04815673828125, "learning_rate": 1.045562040252147e-05, "loss": 0.0019, "num_tokens": 1571319917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5373389092771187, "frac_reward_zero_std": 1.0, "grad_norm": 0.008996393301476521, "kl": 0.04534912109375, "learning_rate": 1.0449668624409668e-05, "loss": 0.0018, "num_tokens": 1571885677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5375096014338141, "frac_reward_zero_std": 1.0, "grad_norm": 0.007719886512877729, "kl": 0.04443359375, "learning_rate": 1.0443716686681738e-05, "loss": 0.0018, "num_tokens": 1572450109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5376802935905095, "frac_reward_zero_std": 1.0, "grad_norm": 0.00924603482799371, "kl": 0.042236328125, "learning_rate": 1.0437764591450398e-05, "loss": 0.0017, "num_tokens": 1573014861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5378509857472049, "frac_reward_zero_std": 1.0, "grad_norm": 0.007298596270501374, "kl": 0.041259765625, "learning_rate": 1.0431812340828433e-05, "loss": 0.0017, "num_tokens": 1573580333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5380216779039003, "frac_reward_zero_std": 1.0, "grad_norm": 0.006316834787005757, "kl": 0.0396728515625, "learning_rate": 1.0425859936928675e-05, "loss": 0.0016, "num_tokens": 1574152861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5381923700605957, "frac_reward_zero_std": 1.0, "grad_norm": 0.019495603991197843, "kl": 0.04193115234375, "learning_rate": 1.0419907381864012e-05, "loss": 0.0017, "num_tokens": 1574712861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5383630622172911, "frac_reward_zero_std": 1.0, "grad_norm": 0.06930851486629025, "kl": 0.05096435546875, "learning_rate": 1.0413954677747385e-05, "loss": 0.002, "num_tokens": 1575277197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5385337543739865, "frac_reward_zero_std": 1.0, "grad_norm": 0.01879267637759473, "kl": 0.04412841796875, "learning_rate": 1.0408001826691792e-05, "loss": 0.0018, "num_tokens": 1575843981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.538704446530682, "frac_reward_zero_std": 1.0, "grad_norm": 0.013927925368803294, "kl": 0.048095703125, "learning_rate": 1.0402048830810277e-05, "loss": 0.0019, "num_tokens": 1576408717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5388751386873774, "frac_reward_zero_std": 1.0, "grad_norm": 0.00915544872797159, "kl": 0.04327392578125, "learning_rate": 1.039609569221594e-05, "loss": 0.0017, "num_tokens": 1576976141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5390458308440728, "frac_reward_zero_std": 1.0, "grad_norm": 0.01934030662443151, "kl": 0.0521240234375, "learning_rate": 1.0390142413021929e-05, "loss": 0.0021, "num_tokens": 1577543645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5392165230007682, "frac_reward_zero_std": 1.0, "grad_norm": 0.018247018678440998, "kl": 0.0523681640625, "learning_rate": 1.0384188995341449e-05, "loss": 0.0021, "num_tokens": 1578118717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5393872151574635, "frac_reward_zero_std": 1.0, "grad_norm": 0.010861996737821162, "kl": 0.04443359375, "learning_rate": 1.037823544128774e-05, "loss": 0.0018, "num_tokens": 1578686077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5395579073141589, "frac_reward_zero_std": 1.0, "grad_norm": 0.00790271974094161, "kl": 0.04144287109375, "learning_rate": 1.0372281752974108e-05, "loss": 0.0017, "num_tokens": 1579248269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5397285994708543, "frac_reward_zero_std": 1.0, "grad_norm": 0.04552916649664842, "kl": 0.05255126953125, "learning_rate": 1.0366327932513886e-05, "loss": 0.0021, "num_tokens": 1579826061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5398992916275497, "frac_reward_zero_std": 1.0, "grad_norm": 0.0051608765856493485, "kl": 0.0430908203125, "learning_rate": 1.0360373982020478e-05, "loss": 0.0017, "num_tokens": 1580393309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5400699837842451, "frac_reward_zero_std": 1.0, "grad_norm": 0.04035900827147034, "kl": 0.08880615234375, "learning_rate": 1.0354419903607308e-05, "loss": 0.0036, "num_tokens": 1580988493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5402406759409405, "frac_reward_zero_std": 1.0, "grad_norm": 0.009504080252358788, "kl": 0.0439453125, "learning_rate": 1.0348465699387873e-05, "loss": 0.0018, "num_tokens": 1581550893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5404113680976359, "frac_reward_zero_std": 1.0, "grad_norm": 0.03297933184111878, "kl": 0.05841064453125, "learning_rate": 1.0342511371475687e-05, "loss": 0.0023, "num_tokens": 1582108957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5405820602543313, "frac_reward_zero_std": 1.0, "grad_norm": 0.007757585673568342, "kl": 0.04608154296875, "learning_rate": 1.0336556921984333e-05, "loss": 0.0018, "num_tokens": 1582673053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5407527524110267, "frac_reward_zero_std": 1.0, "grad_norm": 0.017438616106168658, "kl": 0.05010986328125, "learning_rate": 1.033060235302742e-05, "loss": 0.002, "num_tokens": 1583238045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5409234445677221, "frac_reward_zero_std": 1.0, "grad_norm": 0.007298009902293215, "kl": 0.04205322265625, "learning_rate": 1.0324647666718604e-05, "loss": 0.0017, "num_tokens": 1583805245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5410941367244175, "frac_reward_zero_std": 1.0, "grad_norm": 0.019374703597737703, "kl": 0.04449462890625, "learning_rate": 1.0318692865171585e-05, "loss": 0.0018, "num_tokens": 1584379389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5412648288811129, "frac_reward_zero_std": 1.0, "grad_norm": 0.010142192202182144, "kl": 0.04754638671875, "learning_rate": 1.0312737950500105e-05, "loss": 0.0019, "num_tokens": 1584947037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5414355210378083, "frac_reward_zero_std": 1.0, "grad_norm": 0.026677540251751175, "kl": 0.0592041015625, "learning_rate": 1.0306782924817941e-05, "loss": 0.0024, "num_tokens": 1585512781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5416062131945037, "frac_reward_zero_std": 1.0, "grad_norm": 0.01626860794426036, "kl": 0.0462646484375, "learning_rate": 1.0300827790238914e-05, "loss": 0.0019, "num_tokens": 1586081293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5417769053511992, "frac_reward_zero_std": 1.0, "grad_norm": 0.016550680805983164, "kl": 0.04864501953125, "learning_rate": 1.029487254887688e-05, "loss": 0.0019, "num_tokens": 1586654013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5419475975078946, "frac_reward_zero_std": 1.0, "grad_norm": 0.026131703775132818, "kl": 0.05413818359375, "learning_rate": 1.0288917202845735e-05, "loss": 0.0022, "num_tokens": 1587227117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5421182896645899, "frac_reward_zero_std": 1.0, "grad_norm": 0.006205396059866036, "kl": 0.04937744140625, "learning_rate": 1.0282961754259414e-05, "loss": 0.002, "num_tokens": 1587787821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5422889818212853, "frac_reward_zero_std": 1.0, "grad_norm": 0.019697066218991602, "kl": 0.04925537109375, "learning_rate": 1.0277006205231882e-05, "loss": 0.002, "num_tokens": 1588350685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5424596739779807, "frac_reward_zero_std": 1.0, "grad_norm": 0.011938461552971302, "kl": 0.0550537109375, "learning_rate": 1.0271050557877147e-05, "loss": 0.0022, "num_tokens": 1588917469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5426303661346761, "frac_reward_zero_std": 1.0, "grad_norm": 0.013943260169820842, "kl": 0.0474853515625, "learning_rate": 1.0265094814309249e-05, "loss": 0.0019, "num_tokens": 1589482173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5428010582913715, "frac_reward_zero_std": 1.0, "grad_norm": 0.01673079347974712, "kl": 0.05096435546875, "learning_rate": 1.025913897664226e-05, "loss": 0.002, "num_tokens": 1590046957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5429717504480669, "frac_reward_zero_std": 1.0, "grad_norm": 0.026316730044778316, "kl": 0.046630859375, "learning_rate": 1.0253183046990292e-05, "loss": 0.0019, "num_tokens": 1590618365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5431424426047623, "frac_reward_zero_std": 1.0, "grad_norm": 0.01982091480123325, "kl": 0.063232421875, "learning_rate": 1.0247227027467479e-05, "loss": 0.0025, "num_tokens": 1591178701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5433131347614577, "frac_reward_zero_std": 1.0, "grad_norm": 0.014502449844034513, "kl": 0.048828125, "learning_rate": 1.0241270920187996e-05, "loss": 0.002, "num_tokens": 1591746957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5434838269181531, "frac_reward_zero_std": 1.0, "grad_norm": 0.04362703470980549, "kl": 0.05224609375, "learning_rate": 1.0235314727266046e-05, "loss": 0.0021, "num_tokens": 1592316125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5436545190748485, "frac_reward_zero_std": 1.0, "grad_norm": 0.03292233591037256, "kl": 0.0518798828125, "learning_rate": 1.0229358450815864e-05, "loss": 0.0021, "num_tokens": 1592882861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5438252112315439, "frac_reward_zero_std": 1.0, "grad_norm": 0.011683750080755144, "kl": 0.04888916015625, "learning_rate": 1.0223402092951707e-05, "loss": 0.002, "num_tokens": 1593447533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5439959033882393, "frac_reward_zero_std": 1.0, "grad_norm": 0.028442665129601672, "kl": 0.05279541015625, "learning_rate": 1.0217445655787874e-05, "loss": 0.0021, "num_tokens": 1594014333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5441665955449347, "frac_reward_zero_std": 1.0, "grad_norm": 0.007903340956911852, "kl": 0.05267333984375, "learning_rate": 1.021148914143868e-05, "loss": 0.0021, "num_tokens": 1594582301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5443372877016301, "frac_reward_zero_std": 1.0, "grad_norm": 0.024694904551632837, "kl": 0.04962158203125, "learning_rate": 1.020553255201848e-05, "loss": 0.002, "num_tokens": 1595150669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5445079798583256, "frac_reward_zero_std": 1.0, "grad_norm": 0.013364300123582595, "kl": 0.0537109375, "learning_rate": 1.0199575889641638e-05, "loss": 0.0022, "num_tokens": 1595716909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.544678672015021, "frac_reward_zero_std": 1.0, "grad_norm": 0.01404296697106231, "kl": 0.049072265625, "learning_rate": 1.0193619156422566e-05, "loss": 0.002, "num_tokens": 1596287229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5448493641717163, "frac_reward_zero_std": 1.0, "grad_norm": 0.024441477991920167, "kl": 0.0545654296875, "learning_rate": 1.0187662354475675e-05, "loss": 0.0022, "num_tokens": 1596857069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5450200563284117, "frac_reward_zero_std": 1.0, "grad_norm": 0.005983147149066233, "kl": 0.04510498046875, "learning_rate": 1.0181705485915429e-05, "loss": 0.0018, "num_tokens": 1597419469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5451907484851071, "frac_reward_zero_std": 1.0, "grad_norm": 0.009919813787257002, "kl": 0.0601806640625, "learning_rate": 1.017574855285629e-05, "loss": 0.0024, "num_tokens": 1597989037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5453614406418025, "frac_reward_zero_std": 1.0, "grad_norm": 0.02176891210638326, "kl": 0.05438232421875, "learning_rate": 1.0169791557412763e-05, "loss": 0.0022, "num_tokens": 1598559661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5455321327984979, "frac_reward_zero_std": 1.0, "grad_norm": 0.07556539221729561, "kl": 0.06402587890625, "learning_rate": 1.0163834501699358e-05, "loss": 0.0026, "num_tokens": 1599125181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5457028249551933, "frac_reward_zero_std": 1.0, "grad_norm": 0.007651165142845625, "kl": 0.05084228515625, "learning_rate": 1.0157877387830623e-05, "loss": 0.002, "num_tokens": 1599687181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5458735171118887, "frac_reward_zero_std": 1.0, "grad_norm": 0.018646352460480554, "kl": 0.0635986328125, "learning_rate": 1.0151920217921111e-05, "loss": 0.0025, "num_tokens": 1600255293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5460442092685841, "frac_reward_zero_std": 1.0, "grad_norm": 0.01964041177180014, "kl": 0.053955078125, "learning_rate": 1.0145962994085407e-05, "loss": 0.0022, "num_tokens": 1600816781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5462149014252795, "frac_reward_zero_std": 1.0, "grad_norm": 0.015893752533627254, "kl": 0.05682373046875, "learning_rate": 1.014000571843811e-05, "loss": 0.0023, "num_tokens": 1601380509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5463855935819749, "frac_reward_zero_std": 1.0, "grad_norm": 0.011520129521647118, "kl": 0.0531005859375, "learning_rate": 1.0134048393093836e-05, "loss": 0.0021, "num_tokens": 1601944621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5465562857386703, "frac_reward_zero_std": 1.0, "grad_norm": 0.03678870879727455, "kl": 0.07073974609375, "learning_rate": 1.0128091020167222e-05, "loss": 0.0028, "num_tokens": 1602511853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5467269778953657, "frac_reward_zero_std": 1.0, "grad_norm": 0.7058162602114995, "kl": 0.1634521484375, "learning_rate": 1.0122133601772919e-05, "loss": 0.0066, "num_tokens": 1603080861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5468976700520611, "frac_reward_zero_std": 1.0, "grad_norm": 0.01597916013934051, "kl": 0.0960693359375, "learning_rate": 1.0116176140025596e-05, "loss": 0.0038, "num_tokens": 1603641149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5470683622087565, "frac_reward_zero_std": 1.0, "grad_norm": 0.07983731241989347, "kl": 0.16845703125, "learning_rate": 1.0110218637039937e-05, "loss": 0.0067, "num_tokens": 1604202381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.547239054365452, "frac_reward_zero_std": 1.0, "grad_norm": 0.20604770862900834, "kl": 0.27783203125, "learning_rate": 1.010426109493064e-05, "loss": 0.0111, "num_tokens": 1604774125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5474097465221474, "frac_reward_zero_std": 1.0, "grad_norm": 0.24187722112939594, "kl": 0.29638671875, "learning_rate": 1.009830351581242e-05, "loss": 0.0118, "num_tokens": 1605341853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5475804386788428, "frac_reward_zero_std": 1.0, "grad_norm": 0.12252332191550169, "kl": 0.310546875, "learning_rate": 1.0092345901799998e-05, "loss": 0.0124, "num_tokens": 1605910845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5477511308355381, "frac_reward_zero_std": 1.0, "grad_norm": 0.2858853253037462, "kl": 0.41162109375, "learning_rate": 1.0086388255008115e-05, "loss": 0.0165, "num_tokens": 1606474781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5479218229922335, "frac_reward_zero_std": 1.0, "grad_norm": 0.10431937597710898, "kl": 0.35791015625, "learning_rate": 1.008043057755152e-05, "loss": 0.0143, "num_tokens": 1607036893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5480925151489289, "frac_reward_zero_std": 1.0, "grad_norm": 0.19377728943336234, "kl": 0.29638671875, "learning_rate": 1.0074472871544973e-05, "loss": 0.0118, "num_tokens": 1607601069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5482632073056243, "frac_reward_zero_std": 1.0, "grad_norm": 0.32384236572621194, "kl": 0.300048828125, "learning_rate": 1.0068515139103248e-05, "loss": 0.012, "num_tokens": 1608161773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5484338994623197, "frac_reward_zero_std": 1.0, "grad_norm": 0.28116126651762113, "kl": 0.2158203125, "learning_rate": 1.0062557382341118e-05, "loss": 0.0086, "num_tokens": 1608725005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5486045916190151, "frac_reward_zero_std": 1.0, "grad_norm": 0.23132827458494504, "kl": 0.195068359375, "learning_rate": 1.0056599603373378e-05, "loss": 0.0078, "num_tokens": 1609286541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5487752837757105, "frac_reward_zero_std": 1.0, "grad_norm": 0.04342164609636493, "kl": 0.20068359375, "learning_rate": 1.005064180431482e-05, "loss": 0.008, "num_tokens": 1609849917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5489459759324059, "frac_reward_zero_std": 1.0, "grad_norm": 0.08324501826608866, "kl": 0.193603515625, "learning_rate": 1.004468398728025e-05, "loss": 0.0077, "num_tokens": 1610413549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5491166680891013, "frac_reward_zero_std": 1.0, "grad_norm": 0.030899199098576562, "kl": 0.180908203125, "learning_rate": 1.003872615438448e-05, "loss": 0.0072, "num_tokens": 1610988525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5492873602457967, "frac_reward_zero_std": 1.0, "grad_norm": 0.07496667488459624, "kl": 0.1470947265625, "learning_rate": 1.0032768307742317e-05, "loss": 0.0059, "num_tokens": 1611551661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5494580524024921, "frac_reward_zero_std": 1.0, "grad_norm": 0.19748466614782367, "kl": 0.1441650390625, "learning_rate": 1.0026810449468595e-05, "loss": 0.0058, "num_tokens": 1612115293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5496287445591875, "frac_reward_zero_std": 1.0, "grad_norm": 0.18507162807136707, "kl": 0.1761474609375, "learning_rate": 1.0020852581678126e-05, "loss": 0.0071, "num_tokens": 1612680605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5497994367158829, "frac_reward_zero_std": 1.0, "grad_norm": 0.11823770672216029, "kl": 0.18017578125, "learning_rate": 1.0014894706485748e-05, "loss": 0.0072, "num_tokens": 1613244845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5499701288725783, "frac_reward_zero_std": 1.0, "grad_norm": 0.10119325946854871, "kl": 0.1761474609375, "learning_rate": 1.0008936826006281e-05, "loss": 0.007, "num_tokens": 1613811213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5501408210292738, "frac_reward_zero_std": 1.0, "grad_norm": 0.1477471011737771, "kl": 0.2021484375, "learning_rate": 1.000297894235457e-05, "loss": 0.0081, "num_tokens": 1614374605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5503115131859692, "frac_reward_zero_std": 1.0, "grad_norm": 0.1707640378843799, "kl": 0.21435546875, "learning_rate": 9.997021057645434e-06, "loss": 0.0086, "num_tokens": 1614938925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5504822053426645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0532947234561921, "kl": 0.160888671875, "learning_rate": 9.99106317399372e-06, "loss": 0.0064, "num_tokens": 1615501709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5506528974993599, "frac_reward_zero_std": 1.0, "grad_norm": 0.04158658265944987, "kl": 0.138916015625, "learning_rate": 9.985105293514257e-06, "loss": 0.0056, "num_tokens": 1616067789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5508235896560553, "frac_reward_zero_std": 1.0, "grad_norm": 0.015858711622528755, "kl": 0.1246337890625, "learning_rate": 9.979147418321877e-06, "loss": 0.005, "num_tokens": 1616634077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5509942818127507, "frac_reward_zero_std": 1.0, "grad_norm": 0.025932618409159516, "kl": 0.100830078125, "learning_rate": 9.973189550531408e-06, "loss": 0.004, "num_tokens": 1617198397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5511649739694461, "frac_reward_zero_std": 1.0, "grad_norm": 0.01795358848765173, "kl": 0.09228515625, "learning_rate": 9.967231692257683e-06, "loss": 0.0037, "num_tokens": 1617759613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5513356661261415, "frac_reward_zero_std": 1.0, "grad_norm": 0.05482754920627327, "kl": 0.1063232421875, "learning_rate": 9.961273845615526e-06, "loss": 0.0043, "num_tokens": 1618327021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5515063582828369, "frac_reward_zero_std": 1.0, "grad_norm": 0.04185224108754727, "kl": 0.0992431640625, "learning_rate": 9.955316012719753e-06, "loss": 0.004, "num_tokens": 1618888717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5516770504395323, "frac_reward_zero_std": 1.0, "grad_norm": 0.015129802734704045, "kl": 0.0765380859375, "learning_rate": 9.949358195685182e-06, "loss": 0.0031, "num_tokens": 1619455757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5518477425962277, "frac_reward_zero_std": 1.0, "grad_norm": 0.11548465991694895, "kl": 0.10400390625, "learning_rate": 9.943400396626625e-06, "loss": 0.0042, "num_tokens": 1620023117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5520184347529231, "frac_reward_zero_std": 1.0, "grad_norm": 0.009021879650228179, "kl": 0.0806884765625, "learning_rate": 9.937442617658887e-06, "loss": 0.0032, "num_tokens": 1620589453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5521891269096185, "frac_reward_zero_std": 1.0, "grad_norm": 0.04793402923689905, "kl": 0.090576171875, "learning_rate": 9.931484860896755e-06, "loss": 0.0036, "num_tokens": 1621157949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5523598190663139, "frac_reward_zero_std": 1.0, "grad_norm": 0.08128682034119927, "kl": 0.1014404296875, "learning_rate": 9.925527128455029e-06, "loss": 0.0041, "num_tokens": 1621724621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5525305112230093, "frac_reward_zero_std": 1.0, "grad_norm": 0.0917241283625982, "kl": 0.110595703125, "learning_rate": 9.91956942244848e-06, "loss": 0.0044, "num_tokens": 1622289229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5527012033797047, "frac_reward_zero_std": 1.0, "grad_norm": 0.05105194356036741, "kl": 0.08740234375, "learning_rate": 9.91361174499189e-06, "loss": 0.0035, "num_tokens": 1622851501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5528718955364001, "frac_reward_zero_std": 1.0, "grad_norm": 0.020373566682897068, "kl": 0.0863037109375, "learning_rate": 9.907654098200005e-06, "loss": 0.0034, "num_tokens": 1623413021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5530425876930956, "frac_reward_zero_std": 1.0, "grad_norm": 0.01488612331732493, "kl": 0.103759765625, "learning_rate": 9.901696484187585e-06, "loss": 0.0041, "num_tokens": 1623979677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5532132798497909, "frac_reward_zero_std": 1.0, "grad_norm": 0.03641445248026615, "kl": 0.1099853515625, "learning_rate": 9.895738905069361e-06, "loss": 0.0044, "num_tokens": 1624538637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5533839720064863, "frac_reward_zero_std": 1.0, "grad_norm": 0.039241629858575294, "kl": 0.139892578125, "learning_rate": 9.889781362960067e-06, "loss": 0.0056, "num_tokens": 1625107405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5535546641631817, "frac_reward_zero_std": 1.0, "grad_norm": 0.03956312699545527, "kl": 0.126953125, "learning_rate": 9.883823859974408e-06, "loss": 0.0051, "num_tokens": 1625672381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5537253563198771, "frac_reward_zero_std": 1.0, "grad_norm": 0.044750551222843514, "kl": 0.11865234375, "learning_rate": 9.877866398227085e-06, "loss": 0.0047, "num_tokens": 1626238301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5538960484765725, "frac_reward_zero_std": 1.0, "grad_norm": 0.05193362170089651, "kl": 0.121337890625, "learning_rate": 9.87190897983278e-06, "loss": 0.0048, "num_tokens": 1626805197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5540667406332679, "frac_reward_zero_std": 1.0, "grad_norm": 0.06634005106109221, "kl": 0.1273193359375, "learning_rate": 9.865951606906169e-06, "loss": 0.0051, "num_tokens": 1627368781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5542374327899633, "frac_reward_zero_std": 1.0, "grad_norm": 0.09141140254294373, "kl": 0.13525390625, "learning_rate": 9.859994281561892e-06, "loss": 0.0054, "num_tokens": 1627928653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5544081249466587, "frac_reward_zero_std": 1.0, "grad_norm": 0.027376117691634855, "kl": 0.130615234375, "learning_rate": 9.854037005914596e-06, "loss": 0.0052, "num_tokens": 1628495581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5545788171033541, "frac_reward_zero_std": 1.0, "grad_norm": 0.030284996415366673, "kl": 0.1025390625, "learning_rate": 9.84807978207889e-06, "loss": 0.0041, "num_tokens": 1629057613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5547495092600495, "frac_reward_zero_std": 1.0, "grad_norm": 0.14787729946933853, "kl": 0.1312255859375, "learning_rate": 9.842122612169382e-06, "loss": 0.0052, "num_tokens": 1629620925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5549202014167449, "frac_reward_zero_std": 1.0, "grad_norm": 0.039920969432675715, "kl": 0.1142578125, "learning_rate": 9.836165498300645e-06, "loss": 0.0046, "num_tokens": 1630188477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5550908935734403, "frac_reward_zero_std": 1.0, "grad_norm": 0.2167990488245674, "kl": 0.155517578125, "learning_rate": 9.830208442587239e-06, "loss": 0.0062, "num_tokens": 1630749981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5552615857301357, "frac_reward_zero_std": 1.0, "grad_norm": 0.036650554137142265, "kl": 0.1259765625, "learning_rate": 9.82425144714371e-06, "loss": 0.005, "num_tokens": 1631315069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5554322778868311, "frac_reward_zero_std": 1.0, "grad_norm": 0.07424766105058977, "kl": 0.169677734375, "learning_rate": 9.818294514084576e-06, "loss": 0.0068, "num_tokens": 1631895261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5556029700435265, "frac_reward_zero_std": 1.0, "grad_norm": 0.05093632844358064, "kl": 0.13330078125, "learning_rate": 9.812337645524326e-06, "loss": 0.0053, "num_tokens": 1632465453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.555773662200222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0670129563706962, "kl": 0.165771484375, "learning_rate": 9.806380843577438e-06, "loss": 0.0066, "num_tokens": 1633035741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5559443543569172, "frac_reward_zero_std": 1.0, "grad_norm": 0.07073840478079546, "kl": 0.171142578125, "learning_rate": 9.800424110358362e-06, "loss": 0.0068, "num_tokens": 1633600637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5561150465136127, "frac_reward_zero_std": 1.0, "grad_norm": 0.08799770716039061, "kl": 0.1724853515625, "learning_rate": 9.794467447981525e-06, "loss": 0.0069, "num_tokens": 1634163965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5562857386703081, "frac_reward_zero_std": 1.0, "grad_norm": 0.05644570059485915, "kl": 0.1571044921875, "learning_rate": 9.788510858561323e-06, "loss": 0.0063, "num_tokens": 1634727101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5564564308270035, "frac_reward_zero_std": 1.0, "grad_norm": 0.03933737380317136, "kl": 0.134765625, "learning_rate": 9.782554344212129e-06, "loss": 0.0054, "num_tokens": 1635289869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5566271229836989, "frac_reward_zero_std": 1.0, "grad_norm": 0.033538115427472606, "kl": 0.13623046875, "learning_rate": 9.776597907048295e-06, "loss": 0.0054, "num_tokens": 1635862349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5567978151403943, "frac_reward_zero_std": 1.0, "grad_norm": 0.02893285560964285, "kl": 0.150390625, "learning_rate": 9.770641549184143e-06, "loss": 0.006, "num_tokens": 1636432269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5569685072970897, "frac_reward_zero_std": 1.0, "grad_norm": 0.026268901793076036, "kl": 0.1297607421875, "learning_rate": 9.764685272733957e-06, "loss": 0.0052, "num_tokens": 1636999501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5571391994537851, "frac_reward_zero_std": 1.0, "grad_norm": 0.04316693603739792, "kl": 0.117919921875, "learning_rate": 9.758729079812008e-06, "loss": 0.0047, "num_tokens": 1637566733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5573098916104805, "frac_reward_zero_std": 1.0, "grad_norm": 0.01418455493946408, "kl": 0.12060546875, "learning_rate": 9.752772972532524e-06, "loss": 0.0048, "num_tokens": 1638130557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5574805837671759, "frac_reward_zero_std": 1.0, "grad_norm": 0.05237547723726442, "kl": 0.1317138671875, "learning_rate": 9.746816953009715e-06, "loss": 0.0053, "num_tokens": 1638692045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5576512759238713, "frac_reward_zero_std": 1.0, "grad_norm": 0.590248532854403, "kl": 0.197509765625, "learning_rate": 9.740861023357742e-06, "loss": 0.0079, "num_tokens": 1639257645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5578219680805667, "frac_reward_zero_std": 1.0, "grad_norm": 0.03126626460055019, "kl": 0.12744140625, "learning_rate": 9.734905185690755e-06, "loss": 0.0051, "num_tokens": 1639826429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5579926602372621, "frac_reward_zero_std": 1.0, "grad_norm": 0.05276465153799684, "kl": 0.18603515625, "learning_rate": 9.728949442122853e-06, "loss": 0.0074, "num_tokens": 1640389805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5581633523939575, "frac_reward_zero_std": 1.0, "grad_norm": 0.058651574982089845, "kl": 0.191650390625, "learning_rate": 9.722993794768123e-06, "loss": 0.0077, "num_tokens": 1640957629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5583340445506529, "frac_reward_zero_std": 1.0, "grad_norm": 0.061290270760968296, "kl": 0.252197265625, "learning_rate": 9.717038245740591e-06, "loss": 0.0101, "num_tokens": 1641537725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5585047367073483, "frac_reward_zero_std": 1.0, "grad_norm": 0.1234420446071847, "kl": 0.229736328125, "learning_rate": 9.711082797154269e-06, "loss": 0.0092, "num_tokens": 1642098989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5586754288640436, "frac_reward_zero_std": 1.0, "grad_norm": 0.05987911051573562, "kl": 0.26318359375, "learning_rate": 9.705127451123122e-06, "loss": 0.0105, "num_tokens": 1642674685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.558846121020739, "frac_reward_zero_std": 1.0, "grad_norm": 0.08491682182438587, "kl": 0.283203125, "learning_rate": 9.69917220976109e-06, "loss": 0.0113, "num_tokens": 1643246141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5590168131774345, "frac_reward_zero_std": 1.0, "grad_norm": 0.04963127438893491, "kl": 0.1884765625, "learning_rate": 9.693217075182062e-06, "loss": 0.0075, "num_tokens": 1643815277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5591875053341299, "frac_reward_zero_std": 1.0, "grad_norm": 0.04874173338192341, "kl": 0.158203125, "learning_rate": 9.687262049499898e-06, "loss": 0.0063, "num_tokens": 1644381629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5593581974908253, "frac_reward_zero_std": 1.0, "grad_norm": 0.060938295431648415, "kl": 0.1544189453125, "learning_rate": 9.681307134828415e-06, "loss": 0.0062, "num_tokens": 1644944269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5595288896475207, "frac_reward_zero_std": 1.0, "grad_norm": 0.05106041193167149, "kl": 0.14501953125, "learning_rate": 9.675352333281399e-06, "loss": 0.0058, "num_tokens": 1645504365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5596995818042161, "frac_reward_zero_std": 1.0, "grad_norm": 0.1622721807012855, "kl": 0.141357421875, "learning_rate": 9.669397646972586e-06, "loss": 0.0057, "num_tokens": 1646066781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5598702739609115, "frac_reward_zero_std": 1.0, "grad_norm": 0.03484415580286506, "kl": 0.11572265625, "learning_rate": 9.663443078015669e-06, "loss": 0.0046, "num_tokens": 1646632509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5600409661176069, "frac_reward_zero_std": 1.0, "grad_norm": 0.017925804518492027, "kl": 0.0985107421875, "learning_rate": 9.657488628524313e-06, "loss": 0.0039, "num_tokens": 1647194333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5602116582743023, "frac_reward_zero_std": 1.0, "grad_norm": 0.023933438409658968, "kl": 0.095947265625, "learning_rate": 9.651534300612134e-06, "loss": 0.0038, "num_tokens": 1647768589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5603823504309977, "frac_reward_zero_std": 1.0, "grad_norm": 0.028148257847911046, "kl": 0.0823974609375, "learning_rate": 9.645580096392695e-06, "loss": 0.0033, "num_tokens": 1648335773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5605530425876931, "frac_reward_zero_std": 1.0, "grad_norm": 0.02708475605472728, "kl": 0.0887451171875, "learning_rate": 9.639626017979526e-06, "loss": 0.0035, "num_tokens": 1648900253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5607237347443885, "frac_reward_zero_std": 1.0, "grad_norm": 0.0149708078075518, "kl": 0.0794677734375, "learning_rate": 9.633672067486116e-06, "loss": 0.0032, "num_tokens": 1649463101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5608944269010839, "frac_reward_zero_std": 1.0, "grad_norm": 0.0164061977199467, "kl": 0.0699462890625, "learning_rate": 9.627718247025897e-06, "loss": 0.0028, "num_tokens": 1650034669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5610651190577793, "frac_reward_zero_std": 1.0, "grad_norm": 0.05252673614769996, "kl": 0.103759765625, "learning_rate": 9.621764558712263e-06, "loss": 0.0042, "num_tokens": 1650603405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5612358112144747, "frac_reward_zero_std": 1.0, "grad_norm": 0.10113475105247008, "kl": 0.0921630859375, "learning_rate": 9.615811004658555e-06, "loss": 0.0037, "num_tokens": 1651171533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.56140650337117, "frac_reward_zero_std": 1.0, "grad_norm": 0.013336937424128541, "kl": 0.0836181640625, "learning_rate": 9.609857586978073e-06, "loss": 0.0033, "num_tokens": 1651740141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5615771955278654, "frac_reward_zero_std": 1.0, "grad_norm": 0.03388472146308188, "kl": 0.108642578125, "learning_rate": 9.603904307784064e-06, "loss": 0.0043, "num_tokens": 1652310157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5617478876845609, "frac_reward_zero_std": 1.0, "grad_norm": 0.021672792586972932, "kl": 0.0948486328125, "learning_rate": 9.597951169189727e-06, "loss": 0.0038, "num_tokens": 1652873597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5619185798412563, "frac_reward_zero_std": 1.0, "grad_norm": 0.023596345436115898, "kl": 0.122802734375, "learning_rate": 9.591998173308211e-06, "loss": 0.0049, "num_tokens": 1653435005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5620892719979517, "frac_reward_zero_std": 1.0, "grad_norm": 0.01909995354459555, "kl": 0.112548828125, "learning_rate": 9.586045322252617e-06, "loss": 0.0045, "num_tokens": 1653995565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5622599641546471, "frac_reward_zero_std": 1.0, "grad_norm": 0.02760592594697453, "kl": 0.109130859375, "learning_rate": 9.580092618135993e-06, "loss": 0.0044, "num_tokens": 1654561709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5624306563113425, "frac_reward_zero_std": 1.0, "grad_norm": 0.03462495085056362, "kl": 0.1297607421875, "learning_rate": 9.574140063071328e-06, "loss": 0.0052, "num_tokens": 1655125341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5626013484680379, "frac_reward_zero_std": 1.0, "grad_norm": 0.0290717610871306, "kl": 0.133056640625, "learning_rate": 9.568187659171569e-06, "loss": 0.0053, "num_tokens": 1655703021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5627720406247333, "frac_reward_zero_std": 1.0, "grad_norm": 0.03163905394084206, "kl": 0.14892578125, "learning_rate": 9.562235408549603e-06, "loss": 0.006, "num_tokens": 1656267389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5629427327814287, "frac_reward_zero_std": 1.0, "grad_norm": 0.02550373545204294, "kl": 0.10888671875, "learning_rate": 9.556283313318267e-06, "loss": 0.0044, "num_tokens": 1656830573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5631134249381241, "frac_reward_zero_std": 1.0, "grad_norm": 0.04534064266134246, "kl": 0.1466064453125, "learning_rate": 9.550331375590335e-06, "loss": 0.0059, "num_tokens": 1657393741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5632841170948195, "frac_reward_zero_std": 1.0, "grad_norm": 0.01275659504954427, "kl": 0.1025390625, "learning_rate": 9.544379597478532e-06, "loss": 0.0041, "num_tokens": 1657958029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5634548092515149, "frac_reward_zero_std": 1.0, "grad_norm": 0.16481291266022968, "kl": 0.161376953125, "learning_rate": 9.538427981095526e-06, "loss": 0.0065, "num_tokens": 1658519773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5636255014082103, "frac_reward_zero_std": 1.0, "grad_norm": 0.027764881913740602, "kl": 0.1407470703125, "learning_rate": 9.53247652855393e-06, "loss": 0.0056, "num_tokens": 1659082109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5637961935649057, "frac_reward_zero_std": 1.0, "grad_norm": 0.036954205901479, "kl": 0.1165771484375, "learning_rate": 9.52652524196628e-06, "loss": 0.0047, "num_tokens": 1659644701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5639668857216011, "frac_reward_zero_std": 1.0, "grad_norm": 0.02455819447942061, "kl": 0.1251220703125, "learning_rate": 9.520574123445085e-06, "loss": 0.005, "num_tokens": 1660216717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5641375778782965, "frac_reward_zero_std": 1.0, "grad_norm": 0.028929422353234185, "kl": 0.147216796875, "learning_rate": 9.514623175102766e-06, "loss": 0.0059, "num_tokens": 1660778477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5643082700349918, "frac_reward_zero_std": 1.0, "grad_norm": 0.01399644959089142, "kl": 0.1429443359375, "learning_rate": 9.508672399051702e-06, "loss": 0.0057, "num_tokens": 1661347757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5644789621916873, "frac_reward_zero_std": 1.0, "grad_norm": 0.01635219184003774, "kl": 0.116455078125, "learning_rate": 9.502721797404198e-06, "loss": 0.0047, "num_tokens": 1661911901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5646496543483827, "frac_reward_zero_std": 1.0, "grad_norm": 0.15657804991756127, "kl": 0.16650390625, "learning_rate": 9.496771372272506e-06, "loss": 0.0067, "num_tokens": 1662480525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5648203465050781, "frac_reward_zero_std": 1.0, "grad_norm": 0.03896781171083177, "kl": 0.12158203125, "learning_rate": 9.490821125768809e-06, "loss": 0.0049, "num_tokens": 1663043613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5649910386617735, "frac_reward_zero_std": 1.0, "grad_norm": 0.01318681320561813, "kl": 0.1175537109375, "learning_rate": 9.484871060005236e-06, "loss": 0.0047, "num_tokens": 1663607757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5651617308184689, "frac_reward_zero_std": 1.0, "grad_norm": 0.05672044498168299, "kl": 0.1531982421875, "learning_rate": 9.478921177093841e-06, "loss": 0.0061, "num_tokens": 1664174525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5653324229751643, "frac_reward_zero_std": 1.0, "grad_norm": 0.05832894617702293, "kl": 0.195068359375, "learning_rate": 9.472971479146614e-06, "loss": 0.0078, "num_tokens": 1664737389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5655031151318597, "frac_reward_zero_std": 1.0, "grad_norm": 0.08985183031900539, "kl": 0.197265625, "learning_rate": 9.46702196827549e-06, "loss": 0.0079, "num_tokens": 1665313933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5656738072885551, "frac_reward_zero_std": 1.0, "grad_norm": 0.09529403991994853, "kl": 0.197265625, "learning_rate": 9.461072646592331e-06, "loss": 0.0079, "num_tokens": 1665873629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5658444994452505, "frac_reward_zero_std": 1.0, "grad_norm": 0.08146659274921791, "kl": 0.1954345703125, "learning_rate": 9.455123516208925e-06, "loss": 0.0078, "num_tokens": 1666442413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5660151916019459, "frac_reward_zero_std": 1.0, "grad_norm": 0.04338820283264767, "kl": 0.18896484375, "learning_rate": 9.449174579237001e-06, "loss": 0.0076, "num_tokens": 1667009677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5661858837586413, "frac_reward_zero_std": 1.0, "grad_norm": 0.04415585901860641, "kl": 0.26171875, "learning_rate": 9.44322583778822e-06, "loss": 0.0105, "num_tokens": 1667572877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5663565759153367, "frac_reward_zero_std": 1.0, "grad_norm": 0.17404338990787283, "kl": 0.306396484375, "learning_rate": 9.437277293974169e-06, "loss": 0.0122, "num_tokens": 1668144301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5665272680720321, "frac_reward_zero_std": 1.0, "grad_norm": 0.11449611147376698, "kl": 0.3974609375, "learning_rate": 9.431328949906363e-06, "loss": 0.0159, "num_tokens": 1668707501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5666979602287275, "frac_reward_zero_std": 1.0, "grad_norm": 0.17963952395703583, "kl": 0.5068359375, "learning_rate": 9.425380807696251e-06, "loss": 0.0203, "num_tokens": 1669271981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5668686523854229, "frac_reward_zero_std": 1.0, "grad_norm": 0.1997931098368909, "kl": 0.568359375, "learning_rate": 9.419432869455211e-06, "loss": 0.0227, "num_tokens": 1669838813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5670393445421182, "frac_reward_zero_std": 1.0, "grad_norm": 0.12863653123045737, "kl": 0.50146484375, "learning_rate": 9.413485137294548e-06, "loss": 0.0201, "num_tokens": 1670408093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5672100366988136, "frac_reward_zero_std": 1.0, "grad_norm": 0.09352886337047324, "kl": 0.43798828125, "learning_rate": 9.407537613325485e-06, "loss": 0.0175, "num_tokens": 1670974749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.567380728855509, "frac_reward_zero_std": 1.0, "grad_norm": 0.12054126755076482, "kl": 0.50830078125, "learning_rate": 9.401590299659185e-06, "loss": 0.0203, "num_tokens": 1671537101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5675514210122045, "frac_reward_zero_std": 1.0, "grad_norm": 0.04107205290149488, "kl": 0.388671875, "learning_rate": 9.395643198406727e-06, "loss": 0.0156, "num_tokens": 1672103725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5677221131688999, "frac_reward_zero_std": 1.0, "grad_norm": 0.1925899782779875, "kl": 0.37060546875, "learning_rate": 9.389696311679119e-06, "loss": 0.0148, "num_tokens": 1672673133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5678928053255953, "frac_reward_zero_std": 1.0, "grad_norm": 0.3476788385836889, "kl": 0.3681640625, "learning_rate": 9.383749641587285e-06, "loss": 0.0147, "num_tokens": 1673246077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5680634974822907, "frac_reward_zero_std": 1.0, "grad_norm": 0.21682158367020846, "kl": 0.3994140625, "learning_rate": 9.377803190242086e-06, "loss": 0.016, "num_tokens": 1673815805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5682341896389861, "frac_reward_zero_std": 1.0, "grad_norm": 0.3462300743767269, "kl": 0.49609375, "learning_rate": 9.371856959754293e-06, "loss": 0.0199, "num_tokens": 1674376285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5684048817956815, "frac_reward_zero_std": 1.0, "grad_norm": 0.23049399094655093, "kl": 0.6455078125, "learning_rate": 9.365910952234609e-06, "loss": 0.0258, "num_tokens": 1674936093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5685755739523769, "frac_reward_zero_std": 1.0, "grad_norm": 0.2869360439537812, "kl": 0.6748046875, "learning_rate": 9.359965169793644e-06, "loss": 0.027, "num_tokens": 1675502605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5687462661090723, "frac_reward_zero_std": 0.9375, "grad_norm": 0.33150460559328204, "kl": 1.162109375, "learning_rate": 9.35401961454194e-06, "loss": 0.0465, "num_tokens": 1676064925.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 3332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5689169582657677, "frac_reward_zero_std": 1.0, "grad_norm": 1.2331242732834906, "kl": 1.595703125, "learning_rate": 9.348074288589953e-06, "loss": 0.0638, "num_tokens": 1676630717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1947.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1605.0, "completions/min_terminated_length": 0.0, "epoch": 0.5690876504224631, "frac_reward_zero_std": 1.0, "grad_norm": 0.6571923225308879, "kl": 1.705078125, "learning_rate": 9.342129194048063e-06, "loss": 0.0683, "num_tokens": 1677167789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5692583425791585, "frac_reward_zero_std": 1.0, "grad_norm": 1.7366009871091692, "kl": 1.0107421875, "learning_rate": 9.33618433302656e-06, "loss": 0.0405, "num_tokens": 1677744717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5694290347358539, "frac_reward_zero_std": 1.0, "grad_norm": 0.4840856177529929, "kl": 0.443359375, "learning_rate": 9.330239707635657e-06, "loss": 0.0178, "num_tokens": 1678313341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5695997268925493, "frac_reward_zero_std": 1.0, "grad_norm": 0.04265945363785463, "kl": 0.310791015625, "learning_rate": 9.324295319985479e-06, "loss": 0.0124, "num_tokens": 1678875613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5697704190492446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0489280976491727, "kl": 0.261474609375, "learning_rate": 9.318351172186074e-06, "loss": 0.0105, "num_tokens": 1679439501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.56994111120594, "frac_reward_zero_std": 1.0, "grad_norm": 0.036372095266990465, "kl": 0.15966796875, "learning_rate": 9.312407266347396e-06, "loss": 0.0064, "num_tokens": 1680002461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5701118033626354, "frac_reward_zero_std": 1.0, "grad_norm": 0.023673873700343618, "kl": 0.0904541015625, "learning_rate": 9.306463604579314e-06, "loss": 0.0036, "num_tokens": 1680572941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5702824955193309, "frac_reward_zero_std": 1.0, "grad_norm": 0.051008041821638754, "kl": 0.127685546875, "learning_rate": 9.30052018899162e-06, "loss": 0.0051, "num_tokens": 1681137549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5704531876760263, "frac_reward_zero_std": 1.0, "grad_norm": 0.02660157044689489, "kl": 0.09381103515625, "learning_rate": 9.294577021694013e-06, "loss": 0.0038, "num_tokens": 1681704765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5706238798327217, "frac_reward_zero_std": 1.0, "grad_norm": 0.05186860983633022, "kl": 0.12109375, "learning_rate": 9.288634104796094e-06, "loss": 0.0049, "num_tokens": 1682270605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5707945719894171, "frac_reward_zero_std": 1.0, "grad_norm": 0.06906080033667993, "kl": 0.142578125, "learning_rate": 9.282691440407387e-06, "loss": 0.0057, "num_tokens": 1682839149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5709652641461125, "frac_reward_zero_std": 1.0, "grad_norm": 0.09645250368983868, "kl": 0.1768798828125, "learning_rate": 9.27674903063733e-06, "loss": 0.0071, "num_tokens": 1683408125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5711359563028079, "frac_reward_zero_std": 1.0, "grad_norm": 0.14099361835193908, "kl": 0.21142578125, "learning_rate": 9.27080687759526e-06, "loss": 0.0084, "num_tokens": 1683976269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5713066484595033, "frac_reward_zero_std": 1.0, "grad_norm": 0.15523006624392718, "kl": 0.3037109375, "learning_rate": 9.264864983390423e-06, "loss": 0.0121, "num_tokens": 1684540813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5714773406161987, "frac_reward_zero_std": 1.0, "grad_norm": 0.07173705129909677, "kl": 0.34765625, "learning_rate": 9.25892335013198e-06, "loss": 0.0139, "num_tokens": 1685105885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5716480327728941, "frac_reward_zero_std": 1.0, "grad_norm": 0.12059080531681825, "kl": 0.41455078125, "learning_rate": 9.252981979928997e-06, "loss": 0.0166, "num_tokens": 1685676733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5718187249295895, "frac_reward_zero_std": 1.0, "grad_norm": 0.1018112926439133, "kl": 0.6025390625, "learning_rate": 9.247040874890447e-06, "loss": 0.0241, "num_tokens": 1686239565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5719894170862849, "frac_reward_zero_std": 1.0, "grad_norm": 0.09196013149820381, "kl": 0.744140625, "learning_rate": 9.241100037125205e-06, "loss": 0.0298, "num_tokens": 1686801917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5721601092429803, "frac_reward_zero_std": 1.0, "grad_norm": 0.41080259051798274, "kl": 0.8203125, "learning_rate": 9.235159468742053e-06, "loss": 0.0328, "num_tokens": 1687375565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5723308013996757, "frac_reward_zero_std": 1.0, "grad_norm": 0.16909808657224784, "kl": 0.5888671875, "learning_rate": 9.229219171849683e-06, "loss": 0.0236, "num_tokens": 1687939101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.572501493556371, "frac_reward_zero_std": 1.0, "grad_norm": 0.045667107119119026, "kl": 0.4248046875, "learning_rate": 9.223279148556685e-06, "loss": 0.017, "num_tokens": 1688504509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5726721857130664, "frac_reward_zero_std": 1.0, "grad_norm": 0.03702014113902462, "kl": 0.32275390625, "learning_rate": 9.217339400971546e-06, "loss": 0.0129, "num_tokens": 1689068269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5728428778697618, "frac_reward_zero_std": 1.0, "grad_norm": 0.05262032721407626, "kl": 0.249267578125, "learning_rate": 9.21139993120267e-06, "loss": 0.01, "num_tokens": 1689632957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5730135700264573, "frac_reward_zero_std": 1.0, "grad_norm": 0.028066279581427173, "kl": 0.207275390625, "learning_rate": 9.205460741358353e-06, "loss": 0.0083, "num_tokens": 1690204141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5731842621831527, "frac_reward_zero_std": 1.0, "grad_norm": 0.031804688984537526, "kl": 0.218017578125, "learning_rate": 9.199521833546792e-06, "loss": 0.0087, "num_tokens": 1690772765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5733549543398481, "frac_reward_zero_std": 1.0, "grad_norm": 0.02356294870272231, "kl": 0.192626953125, "learning_rate": 9.19358320987608e-06, "loss": 0.0077, "num_tokens": 1691337197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5735256464965435, "frac_reward_zero_std": 1.0, "grad_norm": 0.052522889714973216, "kl": 0.207275390625, "learning_rate": 9.187644872454222e-06, "loss": 0.0083, "num_tokens": 1691901405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5736963386532389, "frac_reward_zero_std": 1.0, "grad_norm": 0.01671401457490863, "kl": 0.223876953125, "learning_rate": 9.181706823389107e-06, "loss": 0.0089, "num_tokens": 1692473421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5738670308099343, "frac_reward_zero_std": 1.0, "grad_norm": 0.10352999810769867, "kl": 0.2802734375, "learning_rate": 9.175769064788538e-06, "loss": 0.0112, "num_tokens": 1693039933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5740377229666297, "frac_reward_zero_std": 1.0, "grad_norm": 0.05118129690876903, "kl": 0.3662109375, "learning_rate": 9.16983159876019e-06, "loss": 0.0146, "num_tokens": 1693606461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5742084151233251, "frac_reward_zero_std": 1.0, "grad_norm": 0.06485203200769675, "kl": 0.37646484375, "learning_rate": 9.163894427411662e-06, "loss": 0.0151, "num_tokens": 1694168909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5743791072800205, "frac_reward_zero_std": 1.0, "grad_norm": 0.0589082482471971, "kl": 0.42529296875, "learning_rate": 9.157957552850426e-06, "loss": 0.017, "num_tokens": 1694743469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5745497994367159, "frac_reward_zero_std": 1.0, "grad_norm": 0.030177164289243642, "kl": 0.44482421875, "learning_rate": 9.152020977183869e-06, "loss": 0.0178, "num_tokens": 1695303469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5747204915934113, "frac_reward_zero_std": 1.0, "grad_norm": 0.082797879388354, "kl": 0.43896484375, "learning_rate": 9.146084702519252e-06, "loss": 0.0175, "num_tokens": 1695863373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5748911837501067, "frac_reward_zero_std": 1.0, "grad_norm": 0.044273138915461606, "kl": 0.41796875, "learning_rate": 9.14014873096374e-06, "loss": 0.0167, "num_tokens": 1696429597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5750618759068021, "frac_reward_zero_std": 1.0, "grad_norm": 0.025583756766302763, "kl": 0.317138671875, "learning_rate": 9.134213064624387e-06, "loss": 0.0127, "num_tokens": 1696992541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5752325680634974, "frac_reward_zero_std": 1.0, "grad_norm": 0.06329529649745087, "kl": 0.271484375, "learning_rate": 9.128277705608148e-06, "loss": 0.0109, "num_tokens": 1697557517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5754032602201928, "frac_reward_zero_std": 1.0, "grad_norm": 0.04111058642622529, "kl": 0.277099609375, "learning_rate": 9.122342656021855e-06, "loss": 0.0111, "num_tokens": 1698120285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5755739523768882, "frac_reward_zero_std": 1.0, "grad_norm": 0.031036023870286693, "kl": 0.244140625, "learning_rate": 9.116407917972235e-06, "loss": 0.0098, "num_tokens": 1698687549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5757446445335836, "frac_reward_zero_std": 1.0, "grad_norm": 0.021323371772721818, "kl": 0.267578125, "learning_rate": 9.11047349356591e-06, "loss": 0.0107, "num_tokens": 1699254893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5759153366902791, "frac_reward_zero_std": 1.0, "grad_norm": 0.04208844793626701, "kl": 0.28564453125, "learning_rate": 9.104539384909387e-06, "loss": 0.0114, "num_tokens": 1699821197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5760860288469745, "frac_reward_zero_std": 1.0, "grad_norm": 0.26557110525298905, "kl": 0.3984375, "learning_rate": 9.098605594109055e-06, "loss": 0.0159, "num_tokens": 1700395693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5762567210036699, "frac_reward_zero_std": 1.0, "grad_norm": 0.1777178961354481, "kl": 0.44189453125, "learning_rate": 9.092672123271197e-06, "loss": 0.0177, "num_tokens": 1700971693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5764274131603653, "frac_reward_zero_std": 1.0, "grad_norm": 0.20477726076363895, "kl": 0.642578125, "learning_rate": 9.086738974501986e-06, "loss": 0.0257, "num_tokens": 1701537277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5765981053170607, "frac_reward_zero_std": 0.9375, "grad_norm": 0.16194587134086255, "kl": 0.591796875, "learning_rate": 9.080806149907474e-06, "loss": 0.0237, "num_tokens": 1702107229.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 3378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5767687974737561, "frac_reward_zero_std": 1.0, "grad_norm": 0.038325053842233954, "kl": 0.4052734375, "learning_rate": 9.074873651593596e-06, "loss": 0.0162, "num_tokens": 1702671101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5769394896304515, "frac_reward_zero_std": 1.0, "grad_norm": 0.061807689172106904, "kl": 0.37353515625, "learning_rate": 9.068941481666172e-06, "loss": 0.0149, "num_tokens": 1703233901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5771101817871469, "frac_reward_zero_std": 1.0, "grad_norm": 0.08779080883341145, "kl": 0.28173828125, "learning_rate": 9.063009642230917e-06, "loss": 0.0113, "num_tokens": 1703804557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5772808739438423, "frac_reward_zero_std": 1.0, "grad_norm": 0.07875500510905661, "kl": 0.34326171875, "learning_rate": 9.057078135393417e-06, "loss": 0.0137, "num_tokens": 1704364141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5774515661005377, "frac_reward_zero_std": 1.0, "grad_norm": 0.07924213433941345, "kl": 0.310546875, "learning_rate": 9.051146963259137e-06, "loss": 0.0124, "num_tokens": 1704926509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5776222582572331, "frac_reward_zero_std": 1.0, "grad_norm": 0.0608127351216401, "kl": 0.412109375, "learning_rate": 9.045216127933435e-06, "loss": 0.0165, "num_tokens": 1705489005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5777929504139285, "frac_reward_zero_std": 1.0, "grad_norm": 0.047810008821135404, "kl": 0.44775390625, "learning_rate": 9.039285631521544e-06, "loss": 0.0179, "num_tokens": 1706053357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5779636425706238, "frac_reward_zero_std": 1.0, "grad_norm": 0.0523639355022822, "kl": 0.5830078125, "learning_rate": 9.033355476128574e-06, "loss": 0.0234, "num_tokens": 1706616573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5781343347273192, "frac_reward_zero_std": 1.0, "grad_norm": 0.1286195082746766, "kl": 0.6552734375, "learning_rate": 9.027425663859514e-06, "loss": 0.0262, "num_tokens": 1707187085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5783050268840146, "frac_reward_zero_std": 1.0, "grad_norm": 0.1305729869166492, "kl": 0.52490234375, "learning_rate": 9.021496196819236e-06, "loss": 0.021, "num_tokens": 1707755885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.57847571904071, "frac_reward_zero_std": 1.0, "grad_norm": 0.250023176330381, "kl": 0.51171875, "learning_rate": 9.015567077112484e-06, "loss": 0.0204, "num_tokens": 1708317565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5786464111974055, "frac_reward_zero_std": 1.0, "grad_norm": 0.23536132705813406, "kl": 0.4091796875, "learning_rate": 9.00963830684389e-06, "loss": 0.0164, "num_tokens": 1708888189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5788171033541009, "frac_reward_zero_std": 1.0, "grad_norm": 0.041384529458188456, "kl": 0.45263671875, "learning_rate": 9.003709888117943e-06, "loss": 0.0181, "num_tokens": 1709450045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5789877955107963, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044529425161996, "kl": 0.52587890625, "learning_rate": 8.99778182303902e-06, "loss": 0.0211, "num_tokens": 1710013053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5791584876674917, "frac_reward_zero_std": 1.0, "grad_norm": 0.10421018425438512, "kl": 0.548828125, "learning_rate": 8.991854113711371e-06, "loss": 0.022, "num_tokens": 1710582541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5793291798241871, "frac_reward_zero_std": 1.0, "grad_norm": 0.17478246592632696, "kl": 0.9013671875, "learning_rate": 8.985926762239126e-06, "loss": 0.0361, "num_tokens": 1711151965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5794998719808825, "frac_reward_zero_std": 1.0, "grad_norm": 1.0816979107308338, "kl": 1.615234375, "learning_rate": 8.979999770726269e-06, "loss": 0.0645, "num_tokens": 1711726029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5796705641375779, "frac_reward_zero_std": 1.0, "grad_norm": 0.5837326007065223, "kl": 1.591796875, "learning_rate": 8.974073141276677e-06, "loss": 0.0636, "num_tokens": 1712287037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5798412562942733, "frac_reward_zero_std": 1.0, "grad_norm": 0.4111211464232526, "kl": 1.671875, "learning_rate": 8.968146875994082e-06, "loss": 0.0669, "num_tokens": 1712847965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5800119484509687, "frac_reward_zero_std": 1.0, "grad_norm": 0.36512022054719767, "kl": 1.751953125, "learning_rate": 8.962220976982103e-06, "loss": 0.0702, "num_tokens": 1713409533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5801826406076641, "frac_reward_zero_std": 1.0, "grad_norm": 1.425118361767583, "kl": 1.7265625, "learning_rate": 8.956295446344212e-06, "loss": 0.069, "num_tokens": 1713981245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5803533327643595, "frac_reward_zero_std": 1.0, "grad_norm": 0.8962021085728158, "kl": 1.486328125, "learning_rate": 8.950370286183762e-06, "loss": 0.0595, "num_tokens": 1714552349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2010.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 1746.0, "completions/min_terminated_length": 0.0, "epoch": 0.5805240249210549, "frac_reward_zero_std": 1.0, "grad_norm": 0.4426463629634506, "kl": 1.341796875, "learning_rate": 8.944445498603976e-06, "loss": 0.0537, "num_tokens": 1715105133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1981.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1582.0, "completions/min_terminated_length": 0.0, "epoch": 0.5806947170777502, "frac_reward_zero_std": 1.0, "grad_norm": 0.7798703382437403, "kl": 1.33203125, "learning_rate": 8.938521085707927e-06, "loss": 0.0533, "num_tokens": 1715655565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2003.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 1865.0, "completions/min_terminated_length": 0.0, "epoch": 0.5808654092344456, "frac_reward_zero_std": 1.0, "grad_norm": 0.7048797579657191, "kl": 1.236328125, "learning_rate": 8.93259704959858e-06, "loss": 0.0495, "num_tokens": 1716208861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.581036101391141, "frac_reward_zero_std": 1.0, "grad_norm": 0.408621652306988, "kl": 1.205078125, "learning_rate": 8.926673392378746e-06, "loss": 0.0482, "num_tokens": 1716768765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5812067935478364, "frac_reward_zero_std": 1.0, "grad_norm": 0.8525536564255282, "kl": 1.0625, "learning_rate": 8.92075011615112e-06, "loss": 0.0425, "num_tokens": 1717333133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5813774857045318, "frac_reward_zero_std": 1.0, "grad_norm": 0.6799960885500741, "kl": 0.849609375, "learning_rate": 8.914827223018236e-06, "loss": 0.034, "num_tokens": 1717899453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5815481778612273, "frac_reward_zero_std": 1.0, "grad_norm": 0.38821557453674216, "kl": 0.515625, "learning_rate": 8.908904715082519e-06, "loss": 0.0206, "num_tokens": 1718465005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5817188700179227, "frac_reward_zero_std": 1.0, "grad_norm": 0.3034960120154006, "kl": 0.27880859375, "learning_rate": 8.90298259444624e-06, "loss": 0.0112, "num_tokens": 1719026573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5818895621746181, "frac_reward_zero_std": 1.0, "grad_norm": 0.2162276367447004, "kl": 0.13427734375, "learning_rate": 8.897060863211545e-06, "loss": 0.0054, "num_tokens": 1719591213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5820602543313135, "frac_reward_zero_std": 1.0, "grad_norm": 0.11008940030341378, "kl": 0.07086181640625, "learning_rate": 8.891139523480424e-06, "loss": 0.0028, "num_tokens": 1720156365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5822309464880089, "frac_reward_zero_std": 1.0, "grad_norm": 0.05605038925062037, "kl": 0.04644775390625, "learning_rate": 8.885218577354747e-06, "loss": 0.0019, "num_tokens": 1720725869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5824016386447043, "frac_reward_zero_std": 1.0, "grad_norm": 0.040502664351703174, "kl": 0.0428466796875, "learning_rate": 8.879298026936232e-06, "loss": 0.0017, "num_tokens": 1721290285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5825723308013997, "frac_reward_zero_std": 1.0, "grad_norm": 0.01756221235058358, "kl": 0.035400390625, "learning_rate": 8.873377874326466e-06, "loss": 0.0014, "num_tokens": 1721860781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5827430229580951, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019474573043291301, "kl": 0.0281982421875, "learning_rate": 8.867458121626883e-06, "loss": 0.0011, "num_tokens": 1722423693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5829137151147905, "frac_reward_zero_std": 1.0, "grad_norm": 0.006159980149143385, "kl": 0.02716064453125, "learning_rate": 8.861538770938784e-06, "loss": 0.0011, "num_tokens": 1722991949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5830844072714859, "frac_reward_zero_std": 1.0, "grad_norm": 0.07855517453044321, "kl": 0.0372314453125, "learning_rate": 8.855619824363326e-06, "loss": 0.0015, "num_tokens": 1723560125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5832550994281813, "frac_reward_zero_std": 1.0, "grad_norm": 0.006584878488135639, "kl": 0.025787353515625, "learning_rate": 8.849701284001523e-06, "loss": 0.001, "num_tokens": 1724130013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5834257915848767, "frac_reward_zero_std": 1.0, "grad_norm": 0.023434779118785713, "kl": 0.02679443359375, "learning_rate": 8.84378315195424e-06, "loss": 0.0011, "num_tokens": 1724697709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.583596483741572, "frac_reward_zero_std": 1.0, "grad_norm": 0.00043422482825291014, "kl": 0.02301025390625, "learning_rate": 8.837865430322197e-06, "loss": 0.0009, "num_tokens": 1725262589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5837671758982674, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004976016778751789, "kl": 0.0213623046875, "learning_rate": 8.831948121205979e-06, "loss": 0.0009, "num_tokens": 1725828669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5839378680549628, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011699067581987588, "kl": 0.021697998046875, "learning_rate": 8.826031226706015e-06, "loss": 0.0009, "num_tokens": 1726400685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5841085602116582, "frac_reward_zero_std": 1.0, "grad_norm": 0.01844123685852558, "kl": 0.023468017578125, "learning_rate": 8.820114748922585e-06, "loss": 0.0009, "num_tokens": 1726965309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5842792523683537, "frac_reward_zero_std": 1.0, "grad_norm": 0.04818536033467353, "kl": 0.026458740234375, "learning_rate": 8.814198689955828e-06, "loss": 0.0011, "num_tokens": 1727527661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5844499445250491, "frac_reward_zero_std": 1.0, "grad_norm": 0.004011358593809377, "kl": 0.020599365234375, "learning_rate": 8.80828305190573e-06, "loss": 0.0008, "num_tokens": 1728090125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5846206366817445, "frac_reward_zero_std": 1.0, "grad_norm": 0.000421903030876626, "kl": 0.02032470703125, "learning_rate": 8.802367836872135e-06, "loss": 0.0008, "num_tokens": 1728653469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5847913288384399, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007540993593545042, "kl": 0.01922607421875, "learning_rate": 8.79645304695472e-06, "loss": 0.0008, "num_tokens": 1729246701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5849620209951353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005735059184989649, "kl": 0.020751953125, "learning_rate": 8.790538684253031e-06, "loss": 0.0008, "num_tokens": 1729816013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5851327131518307, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012641992921579928, "kl": 0.02008056640625, "learning_rate": 8.78462475086645e-06, "loss": 0.0008, "num_tokens": 1730379421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5853034053085261, "frac_reward_zero_std": 1.0, "grad_norm": 0.007555186861768709, "kl": 0.01971435546875, "learning_rate": 8.77871124889421e-06, "loss": 0.0008, "num_tokens": 1730947901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5854740974652215, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037847598487673203, "kl": 0.020355224609375, "learning_rate": 8.772798180435388e-06, "loss": 0.0008, "num_tokens": 1731505997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5856447896219169, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010347713735129333, "kl": 0.019134521484375, "learning_rate": 8.766885547588914e-06, "loss": 0.0008, "num_tokens": 1732072317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5858154817786123, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038313228332651082, "kl": 0.02032470703125, "learning_rate": 8.760973352453556e-06, "loss": 0.0008, "num_tokens": 1732635773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5859861739353077, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011878482399376273, "kl": 0.019317626953125, "learning_rate": 8.75506159712794e-06, "loss": 0.0008, "num_tokens": 1733204525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5861568660920031, "frac_reward_zero_std": 1.0, "grad_norm": 0.006517309968920903, "kl": 0.02044677734375, "learning_rate": 8.749150283710508e-06, "loss": 0.0008, "num_tokens": 1733767949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5863275582486984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006814354229159181, "kl": 0.01959228515625, "learning_rate": 8.743239414299576e-06, "loss": 0.0008, "num_tokens": 1734329933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5864982504053938, "frac_reward_zero_std": 1.0, "grad_norm": 0.00045748270381390423, "kl": 0.0198974609375, "learning_rate": 8.737328990993283e-06, "loss": 0.0008, "num_tokens": 1734896701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5866689425620892, "frac_reward_zero_std": 1.0, "grad_norm": 0.003645825436522605, "kl": 0.01959228515625, "learning_rate": 8.731419015889621e-06, "loss": 0.0008, "num_tokens": 1735462333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5868396347187846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023169436743988884, "kl": 0.01983642578125, "learning_rate": 8.725509491086414e-06, "loss": 0.0008, "num_tokens": 1736025405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.58701032687548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006209956543849217, "kl": 0.019989013671875, "learning_rate": 8.719600418681334e-06, "loss": 0.0008, "num_tokens": 1736587149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5871810190321755, "frac_reward_zero_std": 1.0, "grad_norm": 0.003429576682350043, "kl": 0.019378662109375, "learning_rate": 8.71369180077188e-06, "loss": 0.0008, "num_tokens": 1737152493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5873517111888709, "frac_reward_zero_std": 1.0, "grad_norm": 0.010189606709008814, "kl": 0.021331787109375, "learning_rate": 8.70778363945541e-06, "loss": 0.0009, "num_tokens": 1737714781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5875224033455663, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009695691149216258, "kl": 0.020050048828125, "learning_rate": 8.701875936829097e-06, "loss": 0.0008, "num_tokens": 1738278861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5876930955022617, "frac_reward_zero_std": 1.0, "grad_norm": 0.045000982378378955, "kl": 0.026153564453125, "learning_rate": 8.695968694989968e-06, "loss": 0.001, "num_tokens": 1738846861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5878637876589571, "frac_reward_zero_std": 1.0, "grad_norm": 0.005735489043556178, "kl": 0.020172119140625, "learning_rate": 8.690061916034877e-06, "loss": 0.0008, "num_tokens": 1739412301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5880344798156525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011435120101947818, "kl": 0.019744873046875, "learning_rate": 8.684155602060527e-06, "loss": 0.0008, "num_tokens": 1739983421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5882051719723479, "frac_reward_zero_std": 1.0, "grad_norm": 0.00045670697596006903, "kl": 0.02105712890625, "learning_rate": 8.678249755163434e-06, "loss": 0.0008, "num_tokens": 1740553949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5883758641290433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006578602662914305, "kl": 0.020843505859375, "learning_rate": 8.672344377439964e-06, "loss": 0.0008, "num_tokens": 1741114669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5885465562857387, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013132403053148088, "kl": 0.021240234375, "learning_rate": 8.666439470986316e-06, "loss": 0.0008, "num_tokens": 1741698445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5887172484424341, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008721938946578298, "kl": 0.02099609375, "learning_rate": 8.66053503789852e-06, "loss": 0.0008, "num_tokens": 1742257853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5888879405991295, "frac_reward_zero_std": 1.0, "grad_norm": 0.011252029911873107, "kl": 0.02203369140625, "learning_rate": 8.654631080272431e-06, "loss": 0.0009, "num_tokens": 1742823197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5890586327558248, "frac_reward_zero_std": 1.0, "grad_norm": 0.015222850576722517, "kl": 0.023345947265625, "learning_rate": 8.648727600203741e-06, "loss": 0.0009, "num_tokens": 1743389837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5892293249125202, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011261099114715128, "kl": 0.021484375, "learning_rate": 8.642824599787977e-06, "loss": 0.0009, "num_tokens": 1743954877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5894000170692156, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002925407701395847, "kl": 0.021331787109375, "learning_rate": 8.636922081120493e-06, "loss": 0.0009, "num_tokens": 1744526221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.589570709225911, "frac_reward_zero_std": 1.0, "grad_norm": 0.007871906526635495, "kl": 0.0224609375, "learning_rate": 8.631020046296462e-06, "loss": 0.0009, "num_tokens": 1745088733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5897414013826064, "frac_reward_zero_std": 1.0, "grad_norm": 0.12531675024494124, "kl": 0.038787841796875, "learning_rate": 8.625118497410895e-06, "loss": 0.0016, "num_tokens": 1745653805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5899120935393019, "frac_reward_zero_std": 1.0, "grad_norm": 0.005177464880882737, "kl": 0.0228271484375, "learning_rate": 8.619217436558633e-06, "loss": 0.0009, "num_tokens": 1746212061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5900827856959973, "frac_reward_zero_std": 1.0, "grad_norm": 0.004350601727210213, "kl": 0.023101806640625, "learning_rate": 8.613316865834341e-06, "loss": 0.0009, "num_tokens": 1746784829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5902534778526927, "frac_reward_zero_std": 1.0, "grad_norm": 0.003819368219624339, "kl": 0.0235595703125, "learning_rate": 8.6074167873325e-06, "loss": 0.0009, "num_tokens": 1747356365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5904241700093881, "frac_reward_zero_std": 1.0, "grad_norm": 0.008238708220829, "kl": 0.024627685546875, "learning_rate": 8.60151720314743e-06, "loss": 0.001, "num_tokens": 1747919677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5905948621660835, "frac_reward_zero_std": 1.0, "grad_norm": 0.018502248818112913, "kl": 0.0260009765625, "learning_rate": 8.595618115373276e-06, "loss": 0.001, "num_tokens": 1748489085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5907655543227789, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012574919380621862, "kl": 0.02423095703125, "learning_rate": 8.589719526103995e-06, "loss": 0.001, "num_tokens": 1749053325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5909362464794743, "frac_reward_zero_std": 1.0, "grad_norm": 0.01137397112132031, "kl": 0.02667236328125, "learning_rate": 8.583821437433369e-06, "loss": 0.0011, "num_tokens": 1749619037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5911069386361697, "frac_reward_zero_std": 1.0, "grad_norm": 0.22520928653405944, "kl": 0.053253173828125, "learning_rate": 8.577923851455014e-06, "loss": 0.0021, "num_tokens": 1750188909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5912776307928651, "frac_reward_zero_std": 1.0, "grad_norm": 0.07288927374697834, "kl": 0.035919189453125, "learning_rate": 8.572026770262356e-06, "loss": 0.0014, "num_tokens": 1750753005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5914483229495605, "frac_reward_zero_std": 1.0, "grad_norm": 0.2972164777186996, "kl": 0.06695556640625, "learning_rate": 8.566130195948654e-06, "loss": 0.0027, "num_tokens": 1751317885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5916190151062559, "frac_reward_zero_std": 1.0, "grad_norm": 0.012351697768539097, "kl": 0.0286865234375, "learning_rate": 8.560234130606967e-06, "loss": 0.0011, "num_tokens": 1751882621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5917897072629512, "frac_reward_zero_std": 1.0, "grad_norm": 0.016473841159550792, "kl": 0.03094482421875, "learning_rate": 8.554338576330192e-06, "loss": 0.0012, "num_tokens": 1752449117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5919603994196466, "frac_reward_zero_std": 1.0, "grad_norm": 0.007424926087218865, "kl": 0.03173828125, "learning_rate": 8.548443535211034e-06, "loss": 0.0013, "num_tokens": 1753015933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.592131091576342, "frac_reward_zero_std": 1.0, "grad_norm": 0.018317074230985393, "kl": 0.03363037109375, "learning_rate": 8.54254900934203e-06, "loss": 0.0013, "num_tokens": 1753578365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5923017837330374, "frac_reward_zero_std": 1.0, "grad_norm": 0.014867999559099142, "kl": 0.0347900390625, "learning_rate": 8.536655000815514e-06, "loss": 0.0014, "num_tokens": 1754141389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5924724758897328, "frac_reward_zero_std": 1.0, "grad_norm": 0.03427180660641981, "kl": 0.04107666015625, "learning_rate": 8.530761511723647e-06, "loss": 0.0016, "num_tokens": 1754705485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5926431680464282, "frac_reward_zero_std": 1.0, "grad_norm": 0.016610272136615232, "kl": 0.03741455078125, "learning_rate": 8.524868544158407e-06, "loss": 0.0015, "num_tokens": 1755269517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5928138602031237, "frac_reward_zero_std": 1.0, "grad_norm": 0.02444877848710744, "kl": 0.05633544921875, "learning_rate": 8.518976100211587e-06, "loss": 0.0022, "num_tokens": 1755831981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5929845523598191, "frac_reward_zero_std": 1.0, "grad_norm": 0.019719317560557365, "kl": 0.04119873046875, "learning_rate": 8.51308418197479e-06, "loss": 0.0016, "num_tokens": 1756394141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5931552445165145, "frac_reward_zero_std": 1.0, "grad_norm": 0.014987674680211402, "kl": 0.04815673828125, "learning_rate": 8.507192791539428e-06, "loss": 0.0019, "num_tokens": 1756957693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5933259366732099, "frac_reward_zero_std": 1.0, "grad_norm": 0.023852224665281866, "kl": 0.04559326171875, "learning_rate": 8.501301930996742e-06, "loss": 0.0018, "num_tokens": 1757523117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5934966288299053, "frac_reward_zero_std": 1.0, "grad_norm": 0.028442595870061184, "kl": 0.08160400390625, "learning_rate": 8.495411602437772e-06, "loss": 0.0033, "num_tokens": 1758089693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5936673209866007, "frac_reward_zero_std": 1.0, "grad_norm": 0.037393771141503246, "kl": 0.0570068359375, "learning_rate": 8.48952180795337e-06, "loss": 0.0023, "num_tokens": 1758651933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5938380131432961, "frac_reward_zero_std": 1.0, "grad_norm": 0.042770523237740096, "kl": 0.064697265625, "learning_rate": 8.483632549634198e-06, "loss": 0.0026, "num_tokens": 1759215261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5940087052999915, "frac_reward_zero_std": 1.0, "grad_norm": 0.06103031023743991, "kl": 0.081787109375, "learning_rate": 8.477743829570734e-06, "loss": 0.0033, "num_tokens": 1759784141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5941793974566869, "frac_reward_zero_std": 1.0, "grad_norm": 0.06550335717977898, "kl": 0.08709716796875, "learning_rate": 8.471855649853265e-06, "loss": 0.0035, "num_tokens": 1760350685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5943500896133823, "frac_reward_zero_std": 1.0, "grad_norm": 0.07528956647327253, "kl": 0.07330322265625, "learning_rate": 8.465968012571874e-06, "loss": 0.0029, "num_tokens": 1760915197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5945207817700776, "frac_reward_zero_std": 1.0, "grad_norm": 0.07286399298913697, "kl": 0.09112548828125, "learning_rate": 8.46008091981646e-06, "loss": 0.0036, "num_tokens": 1761486557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.594691473926773, "frac_reward_zero_std": 1.0, "grad_norm": 0.06315500846056066, "kl": 0.06781005859375, "learning_rate": 8.454194373676735e-06, "loss": 0.0027, "num_tokens": 1762048493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5948621660834684, "frac_reward_zero_std": 1.0, "grad_norm": 0.045968386436260236, "kl": 0.0821533203125, "learning_rate": 8.44830837624221e-06, "loss": 0.0033, "num_tokens": 1762606285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5950328582401638, "frac_reward_zero_std": 1.0, "grad_norm": 0.04344482467960352, "kl": 0.0784912109375, "learning_rate": 8.442422929602192e-06, "loss": 0.0031, "num_tokens": 1763169581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5952035503968592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03978118050582405, "kl": 0.07000732421875, "learning_rate": 8.436538035845811e-06, "loss": 0.0028, "num_tokens": 1763744557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5953742425535546, "frac_reward_zero_std": 1.0, "grad_norm": 0.022750308466162716, "kl": 0.0465087890625, "learning_rate": 8.430653697061991e-06, "loss": 0.0019, "num_tokens": 1764307581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.59554493471025, "frac_reward_zero_std": 1.0, "grad_norm": 0.012748312687892945, "kl": 0.05523681640625, "learning_rate": 8.42476991533946e-06, "loss": 0.0022, "num_tokens": 1764877885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5957156268669455, "frac_reward_zero_std": 1.0, "grad_norm": 0.01133257951787835, "kl": 0.06500244140625, "learning_rate": 8.418886692766743e-06, "loss": 0.0026, "num_tokens": 1765441709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5958863190236409, "frac_reward_zero_std": 1.0, "grad_norm": 0.007506716778066436, "kl": 0.0428466796875, "learning_rate": 8.413004031432176e-06, "loss": 0.0017, "num_tokens": 1766009293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5960570111803363, "frac_reward_zero_std": 1.0, "grad_norm": 0.026584731954469528, "kl": 0.07354736328125, "learning_rate": 8.407121933423891e-06, "loss": 0.0029, "num_tokens": 1766572029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5962277033370317, "frac_reward_zero_std": 1.0, "grad_norm": 0.017535432217182987, "kl": 0.06390380859375, "learning_rate": 8.401240400829822e-06, "loss": 0.0026, "num_tokens": 1767137085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5963983954937271, "frac_reward_zero_std": 1.0, "grad_norm": 0.02146289596781968, "kl": 0.05743408203125, "learning_rate": 8.395359435737694e-06, "loss": 0.0023, "num_tokens": 1767705501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5965690876504225, "frac_reward_zero_std": 1.0, "grad_norm": 0.021849198036426827, "kl": 0.048583984375, "learning_rate": 8.389479040235045e-06, "loss": 0.0019, "num_tokens": 1768272285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5967397798071179, "frac_reward_zero_std": 1.0, "grad_norm": 0.02175120910552856, "kl": 0.06365966796875, "learning_rate": 8.383599216409198e-06, "loss": 0.0025, "num_tokens": 1768835661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5969104719638133, "frac_reward_zero_std": 1.0, "grad_norm": 0.036141314801325715, "kl": 0.06549072265625, "learning_rate": 8.377719966347287e-06, "loss": 0.0026, "num_tokens": 1769400701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5970811641205087, "frac_reward_zero_std": 1.0, "grad_norm": 0.018932448551585792, "kl": 0.04388427734375, "learning_rate": 8.371841292136221e-06, "loss": 0.0018, "num_tokens": 1769963005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.597251856277204, "frac_reward_zero_std": 1.0, "grad_norm": 0.02503713218506837, "kl": 0.06585693359375, "learning_rate": 8.365963195862725e-06, "loss": 0.0026, "num_tokens": 1770524045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5974225484338994, "frac_reward_zero_std": 1.0, "grad_norm": 0.01895362660978104, "kl": 0.0606689453125, "learning_rate": 8.360085679613306e-06, "loss": 0.0024, "num_tokens": 1771090109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5975932405905948, "frac_reward_zero_std": 1.0, "grad_norm": 0.030343474534662605, "kl": 0.0723876953125, "learning_rate": 8.354208745474279e-06, "loss": 0.0029, "num_tokens": 1771650205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5977639327472902, "frac_reward_zero_std": 1.0, "grad_norm": 0.026389144150600483, "kl": 0.06396484375, "learning_rate": 8.348332395531735e-06, "loss": 0.0026, "num_tokens": 1772216925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5979346249039856, "frac_reward_zero_std": 1.0, "grad_norm": 0.024911399209525998, "kl": 0.0604248046875, "learning_rate": 8.342456631871569e-06, "loss": 0.0024, "num_tokens": 1772782173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.598105317060681, "frac_reward_zero_std": 1.0, "grad_norm": 0.04152616693644646, "kl": 0.08453369140625, "learning_rate": 8.336581456579462e-06, "loss": 0.0034, "num_tokens": 1773342653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5982760092173764, "frac_reward_zero_std": 1.0, "grad_norm": 0.023010946326614646, "kl": 0.06890869140625, "learning_rate": 8.330706871740896e-06, "loss": 0.0028, "num_tokens": 1773920845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5984467013740719, "frac_reward_zero_std": 1.0, "grad_norm": 0.02534870582317805, "kl": 0.0709228515625, "learning_rate": 8.324832879441132e-06, "loss": 0.0028, "num_tokens": 1774484973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5986173935307673, "frac_reward_zero_std": 1.0, "grad_norm": 0.01485938305549246, "kl": 0.055419921875, "learning_rate": 8.318959481765222e-06, "loss": 0.0022, "num_tokens": 1775048701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5987880856874627, "frac_reward_zero_std": 1.0, "grad_norm": 0.014774221279408606, "kl": 0.05810546875, "learning_rate": 8.313086680798017e-06, "loss": 0.0023, "num_tokens": 1775610589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5989587778441581, "frac_reward_zero_std": 1.0, "grad_norm": 0.017214226007501374, "kl": 0.05438232421875, "learning_rate": 8.307214478624148e-06, "loss": 0.0022, "num_tokens": 1776176925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5991294700008535, "frac_reward_zero_std": 1.0, "grad_norm": 0.03769941948284388, "kl": 0.06817626953125, "learning_rate": 8.301342877328032e-06, "loss": 0.0027, "num_tokens": 1776749197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5993001621575489, "frac_reward_zero_std": 1.0, "grad_norm": 0.03277952971940315, "kl": 0.069091796875, "learning_rate": 8.295471878993873e-06, "loss": 0.0028, "num_tokens": 1777318349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5994708543142443, "frac_reward_zero_std": 1.0, "grad_norm": 0.17384351310596838, "kl": 0.1182861328125, "learning_rate": 8.289601485705669e-06, "loss": 0.0047, "num_tokens": 1777884189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5996415464709397, "frac_reward_zero_std": 1.0, "grad_norm": 0.028069218807549694, "kl": 0.0684814453125, "learning_rate": 8.283731699547198e-06, "loss": 0.0027, "num_tokens": 1778450941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5998122386276351, "frac_reward_zero_std": 1.0, "grad_norm": 0.05057747488878002, "kl": 0.07672119140625, "learning_rate": 8.277862522602019e-06, "loss": 0.0031, "num_tokens": 1779022653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5999829307843305, "frac_reward_zero_std": 1.0, "grad_norm": 0.060842848144809704, "kl": 0.0953369140625, "learning_rate": 8.271993956953474e-06, "loss": 0.0038, "num_tokens": 1779590701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6001536229410258, "frac_reward_zero_std": 1.0, "grad_norm": 0.07940203948662099, "kl": 0.1029052734375, "learning_rate": 8.266126004684702e-06, "loss": 0.0041, "num_tokens": 1780154989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6003243150977212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0588456236139423, "kl": 0.11737060546875, "learning_rate": 8.26025866787861e-06, "loss": 0.0047, "num_tokens": 1780715789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6004950072544166, "frac_reward_zero_std": 1.0, "grad_norm": 0.047489197879489554, "kl": 0.09521484375, "learning_rate": 8.254391948617885e-06, "loss": 0.0038, "num_tokens": 1781278637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.600665699411112, "frac_reward_zero_std": 1.0, "grad_norm": 0.03878592476987471, "kl": 0.0845947265625, "learning_rate": 8.24852584898501e-06, "loss": 0.0034, "num_tokens": 1781846397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6008363915678074, "frac_reward_zero_std": 1.0, "grad_norm": 0.04186698899570539, "kl": 0.0787353515625, "learning_rate": 8.242660371062231e-06, "loss": 0.0032, "num_tokens": 1782413277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6010070837245028, "frac_reward_zero_std": 1.0, "grad_norm": 0.04425620136911284, "kl": 0.114013671875, "learning_rate": 8.236795516931587e-06, "loss": 0.0046, "num_tokens": 1782975821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6011777758811983, "frac_reward_zero_std": 1.0, "grad_norm": 0.02659690569973037, "kl": 0.09716796875, "learning_rate": 8.230931288674882e-06, "loss": 0.0039, "num_tokens": 1783543405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6013484680378937, "frac_reward_zero_std": 1.0, "grad_norm": 0.022461955439676006, "kl": 0.087890625, "learning_rate": 8.225067688373713e-06, "loss": 0.0035, "num_tokens": 1784108077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6015191601945891, "frac_reward_zero_std": 1.0, "grad_norm": 0.027400195200158843, "kl": 0.1292724609375, "learning_rate": 8.219204718109442e-06, "loss": 0.0052, "num_tokens": 1784672605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6016898523512845, "frac_reward_zero_std": 1.0, "grad_norm": 0.016035694067059007, "kl": 0.0970458984375, "learning_rate": 8.213342379963217e-06, "loss": 0.0039, "num_tokens": 1785234621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6018605445079799, "frac_reward_zero_std": 1.0, "grad_norm": 0.05461826988360048, "kl": 0.1224365234375, "learning_rate": 8.207480676015946e-06, "loss": 0.0049, "num_tokens": 1785797485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6020312366646753, "frac_reward_zero_std": 1.0, "grad_norm": 0.02595778851982535, "kl": 0.0767822265625, "learning_rate": 8.201619608348332e-06, "loss": 0.0031, "num_tokens": 1786363725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6022019288213707, "frac_reward_zero_std": 1.0, "grad_norm": 0.01010133522180104, "kl": 0.0909423828125, "learning_rate": 8.195759179040839e-06, "loss": 0.0036, "num_tokens": 1786933421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6023726209780661, "frac_reward_zero_std": 1.0, "grad_norm": 0.04601880767874019, "kl": 0.1048583984375, "learning_rate": 8.18989939017371e-06, "loss": 0.0042, "num_tokens": 1787497581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6025433131347615, "frac_reward_zero_std": 1.0, "grad_norm": 0.05500566991572508, "kl": 0.1090087890625, "learning_rate": 8.184040243826954e-06, "loss": 0.0044, "num_tokens": 1788064557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6027140052914569, "frac_reward_zero_std": 1.0, "grad_norm": 0.05754494692321557, "kl": 0.1356201171875, "learning_rate": 8.17818174208036e-06, "loss": 0.0054, "num_tokens": 1788628541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6028846974481522, "frac_reward_zero_std": 1.0, "grad_norm": 0.018401852078516665, "kl": 0.08984375, "learning_rate": 8.172323887013483e-06, "loss": 0.0036, "num_tokens": 1789193437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6030553896048476, "frac_reward_zero_std": 1.0, "grad_norm": 0.06465848738272864, "kl": 0.1041259765625, "learning_rate": 8.166466680705652e-06, "loss": 0.0042, "num_tokens": 1789761965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.603226081761543, "frac_reward_zero_std": 1.0, "grad_norm": 0.02408547635524993, "kl": 0.12890625, "learning_rate": 8.160610125235963e-06, "loss": 0.0052, "num_tokens": 1790326429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6033967739182384, "frac_reward_zero_std": 1.0, "grad_norm": 0.06576584719059679, "kl": 0.145263671875, "learning_rate": 8.154754222683279e-06, "loss": 0.0058, "num_tokens": 1790897933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6035674660749338, "frac_reward_zero_std": 1.0, "grad_norm": 0.025151088734980055, "kl": 0.1248779296875, "learning_rate": 8.148898975126237e-06, "loss": 0.005, "num_tokens": 1791461645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6037381582316292, "frac_reward_zero_std": 1.0, "grad_norm": 0.03390085823969712, "kl": 0.1707763671875, "learning_rate": 8.143044384643242e-06, "loss": 0.0068, "num_tokens": 1792024685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6039088503883246, "frac_reward_zero_std": 1.0, "grad_norm": 0.018500583680444403, "kl": 0.123779296875, "learning_rate": 8.137190453312453e-06, "loss": 0.005, "num_tokens": 1792587837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.60407954254502, "frac_reward_zero_std": 1.0, "grad_norm": 0.02842857314783605, "kl": 0.178955078125, "learning_rate": 8.131337183211811e-06, "loss": 0.0072, "num_tokens": 1793150669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6042502347017155, "frac_reward_zero_std": 1.0, "grad_norm": 0.016717271846116964, "kl": 0.1572265625, "learning_rate": 8.125484576419014e-06, "loss": 0.0063, "num_tokens": 1793715149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6044209268584109, "frac_reward_zero_std": 1.0, "grad_norm": 0.04330464042009147, "kl": 0.16943359375, "learning_rate": 8.119632635011529e-06, "loss": 0.0068, "num_tokens": 1794288685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6045916190151063, "frac_reward_zero_std": 1.0, "grad_norm": 0.03433857046075599, "kl": 0.161376953125, "learning_rate": 8.113781361066579e-06, "loss": 0.0065, "num_tokens": 1794853021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6047623111718017, "frac_reward_zero_std": 1.0, "grad_norm": 0.23565907319007634, "kl": 0.1904296875, "learning_rate": 8.107930756661155e-06, "loss": 0.0076, "num_tokens": 1795417101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6049330033284971, "frac_reward_zero_std": 1.0, "grad_norm": 0.05676023500393466, "kl": 0.17919921875, "learning_rate": 8.102080823872016e-06, "loss": 0.0072, "num_tokens": 1795983421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6051036954851925, "frac_reward_zero_std": 1.0, "grad_norm": 0.04255505128047755, "kl": 0.205078125, "learning_rate": 8.096231564775674e-06, "loss": 0.0082, "num_tokens": 1796551101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6052743876418879, "frac_reward_zero_std": 1.0, "grad_norm": 7.687623880140723, "kl": 1.08154296875, "learning_rate": 8.090382981448403e-06, "loss": 0.0433, "num_tokens": 1797118381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6054450797985833, "frac_reward_zero_std": 1.0, "grad_norm": 0.05976964176835345, "kl": 0.234130859375, "learning_rate": 8.084535075966237e-06, "loss": 0.0094, "num_tokens": 1797684861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6056157719552786, "frac_reward_zero_std": 1.0, "grad_norm": 0.043413641706769826, "kl": 0.23193359375, "learning_rate": 8.078687850404977e-06, "loss": 0.0093, "num_tokens": 1798250941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.605786464111974, "frac_reward_zero_std": 1.0, "grad_norm": 0.0709009267935203, "kl": 0.25341796875, "learning_rate": 8.07284130684018e-06, "loss": 0.0102, "num_tokens": 1798817741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6059571562686694, "frac_reward_zero_std": 1.0, "grad_norm": 0.08734245746153575, "kl": 0.2392578125, "learning_rate": 8.066995447347146e-06, "loss": 0.0096, "num_tokens": 1799385245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6061278484253648, "frac_reward_zero_std": 1.0, "grad_norm": 0.1182228120475803, "kl": 0.294189453125, "learning_rate": 8.061150274000954e-06, "loss": 0.0118, "num_tokens": 1799948461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6062985405820602, "frac_reward_zero_std": 1.0, "grad_norm": 0.1057983243229991, "kl": 0.25244140625, "learning_rate": 8.055305788876423e-06, "loss": 0.0101, "num_tokens": 1800516621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6064692327387556, "frac_reward_zero_std": 1.0, "grad_norm": 0.07792314829226041, "kl": 0.30517578125, "learning_rate": 8.049461994048143e-06, "loss": 0.0122, "num_tokens": 1801077197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.606639924895451, "frac_reward_zero_std": 1.0, "grad_norm": 0.03446937621042526, "kl": 0.25244140625, "learning_rate": 8.04361889159044e-06, "loss": 0.0101, "num_tokens": 1801642941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6068106170521465, "frac_reward_zero_std": 1.0, "grad_norm": 0.0543301023585408, "kl": 0.2412109375, "learning_rate": 8.03777648357741e-06, "loss": 0.0096, "num_tokens": 1802208461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6069813092088419, "frac_reward_zero_std": 1.0, "grad_norm": 0.07573842733237504, "kl": 0.26416015625, "learning_rate": 8.031934772082896e-06, "loss": 0.0106, "num_tokens": 1802773597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6071520013655373, "frac_reward_zero_std": 1.0, "grad_norm": 0.09883128249203817, "kl": 0.29052734375, "learning_rate": 8.0260937591805e-06, "loss": 0.0116, "num_tokens": 1803338141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6073226935222327, "frac_reward_zero_std": 1.0, "grad_norm": 0.027448555071947454, "kl": 0.2158203125, "learning_rate": 8.020253446943558e-06, "loss": 0.0086, "num_tokens": 1803902269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6074933856789281, "frac_reward_zero_std": 1.0, "grad_norm": 0.02399121182868635, "kl": 0.2109375, "learning_rate": 8.014413837445181e-06, "loss": 0.0085, "num_tokens": 1804465805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6076640778356235, "frac_reward_zero_std": 1.0, "grad_norm": 0.07189977145545562, "kl": 0.2216796875, "learning_rate": 8.008574932758215e-06, "loss": 0.0089, "num_tokens": 1805038797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6078347699923189, "frac_reward_zero_std": 1.0, "grad_norm": 0.1803623889807612, "kl": 0.25732421875, "learning_rate": 8.002736734955264e-06, "loss": 0.0103, "num_tokens": 1805610573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6080054621490143, "frac_reward_zero_std": 1.0, "grad_norm": 0.07068679087573276, "kl": 0.2939453125, "learning_rate": 7.996899246108674e-06, "loss": 0.0118, "num_tokens": 1806171501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6081761543057097, "frac_reward_zero_std": 1.0, "grad_norm": 0.05253818070115447, "kl": 0.302490234375, "learning_rate": 7.991062468290544e-06, "loss": 0.0121, "num_tokens": 1806734893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.608346846462405, "frac_reward_zero_std": 1.0, "grad_norm": 0.16214680405267573, "kl": 0.3740234375, "learning_rate": 7.985226403572717e-06, "loss": 0.015, "num_tokens": 1807296285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6085175386191004, "frac_reward_zero_std": 1.0, "grad_norm": 0.08033563977172206, "kl": 0.29541015625, "learning_rate": 7.979391054026791e-06, "loss": 0.0118, "num_tokens": 1807863101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6086882307757958, "frac_reward_zero_std": 1.0, "grad_norm": 0.03528328434777114, "kl": 0.259765625, "learning_rate": 7.973556421724098e-06, "loss": 0.0104, "num_tokens": 1808428573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6088589229324912, "frac_reward_zero_std": 1.0, "grad_norm": 0.08207984338211373, "kl": 0.26953125, "learning_rate": 7.967722508735725e-06, "loss": 0.0108, "num_tokens": 1808998477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6090296150891866, "frac_reward_zero_std": 1.0, "grad_norm": 0.05353228257267502, "kl": 0.24853515625, "learning_rate": 7.961889317132502e-06, "loss": 0.0099, "num_tokens": 1809569789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.609200307245882, "frac_reward_zero_std": 1.0, "grad_norm": 0.0827227422284897, "kl": 0.298828125, "learning_rate": 7.956056848985e-06, "loss": 0.012, "num_tokens": 1810137229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6093709994025774, "frac_reward_zero_std": 1.0, "grad_norm": 0.07014852442348611, "kl": 0.306640625, "learning_rate": 7.950225106363535e-06, "loss": 0.0123, "num_tokens": 1810701661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6095416915592728, "frac_reward_zero_std": 1.0, "grad_norm": 0.08297035219780174, "kl": 0.3291015625, "learning_rate": 7.944394091338163e-06, "loss": 0.0132, "num_tokens": 1811266141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6097123837159683, "frac_reward_zero_std": 1.0, "grad_norm": 0.139385507036363, "kl": 0.39697265625, "learning_rate": 7.938563805978684e-06, "loss": 0.0159, "num_tokens": 1811834541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6098830758726637, "frac_reward_zero_std": 1.0, "grad_norm": 0.25809409954764584, "kl": 0.419921875, "learning_rate": 7.932734252354646e-06, "loss": 0.0168, "num_tokens": 1812400317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6100537680293591, "frac_reward_zero_std": 1.0, "grad_norm": 0.1009119410880802, "kl": 0.3701171875, "learning_rate": 7.92690543253532e-06, "loss": 0.0148, "num_tokens": 1812969245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6102244601860545, "frac_reward_zero_std": 1.0, "grad_norm": 0.15894920430954257, "kl": 0.33251953125, "learning_rate": 7.92107734858973e-06, "loss": 0.0133, "num_tokens": 1813539549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6103951523427499, "frac_reward_zero_std": 1.0, "grad_norm": 0.1381924863553872, "kl": 0.314453125, "learning_rate": 7.915250002586639e-06, "loss": 0.0126, "num_tokens": 1814101949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6105658444994453, "frac_reward_zero_std": 1.0, "grad_norm": 0.12371110584536962, "kl": 0.2822265625, "learning_rate": 7.909423396594542e-06, "loss": 0.0113, "num_tokens": 1814670381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6107365366561407, "frac_reward_zero_std": 1.0, "grad_norm": 0.0961244957967037, "kl": 0.25390625, "learning_rate": 7.903597532681672e-06, "loss": 0.0102, "num_tokens": 1815232333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6109072288128361, "frac_reward_zero_std": 1.0, "grad_norm": 0.1517818975201803, "kl": 0.21142578125, "learning_rate": 7.897772412916003e-06, "loss": 0.0084, "num_tokens": 1815805085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6110779209695314, "frac_reward_zero_std": 1.0, "grad_norm": 0.07534350553558565, "kl": 0.1597900390625, "learning_rate": 7.89194803936524e-06, "loss": 0.0064, "num_tokens": 1816370573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6112486131262268, "frac_reward_zero_std": 1.0, "grad_norm": 0.10882164613298656, "kl": 0.152099609375, "learning_rate": 7.886124414096832e-06, "loss": 0.0061, "num_tokens": 1816938125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6114193052829222, "frac_reward_zero_std": 1.0, "grad_norm": 0.04332151447429159, "kl": 0.1346435546875, "learning_rate": 7.880301539177944e-06, "loss": 0.0054, "num_tokens": 1817500845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6115899974396176, "frac_reward_zero_std": 1.0, "grad_norm": 0.047327383096607625, "kl": 0.155029296875, "learning_rate": 7.874479416675495e-06, "loss": 0.0062, "num_tokens": 1818063005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.611760689596313, "frac_reward_zero_std": 1.0, "grad_norm": 0.035170251299915745, "kl": 0.1539306640625, "learning_rate": 7.868658048656125e-06, "loss": 0.0062, "num_tokens": 1818623789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6119313817530084, "frac_reward_zero_std": 1.0, "grad_norm": 0.04861404187360848, "kl": 0.14990234375, "learning_rate": 7.862837437186216e-06, "loss": 0.006, "num_tokens": 1819188509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6121020739097038, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591996318854533, "kl": 0.188720703125, "learning_rate": 7.857017584331865e-06, "loss": 0.0075, "num_tokens": 1819758317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6122727660663992, "frac_reward_zero_std": 1.0, "grad_norm": 0.08678846751684391, "kl": 0.18505859375, "learning_rate": 7.851198492158914e-06, "loss": 0.0074, "num_tokens": 1820326813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6124434582230946, "frac_reward_zero_std": 1.0, "grad_norm": 0.054436482717413415, "kl": 0.189453125, "learning_rate": 7.84538016273293e-06, "loss": 0.0076, "num_tokens": 1820893149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6126141503797901, "frac_reward_zero_std": 1.0, "grad_norm": 0.15051447627123155, "kl": 0.190185546875, "learning_rate": 7.839562598119217e-06, "loss": 0.0076, "num_tokens": 1821465021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6127848425364855, "frac_reward_zero_std": 1.0, "grad_norm": 0.035343741299404985, "kl": 0.160400390625, "learning_rate": 7.83374580038279e-06, "loss": 0.0064, "num_tokens": 1822035949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6129555346931809, "frac_reward_zero_std": 1.0, "grad_norm": 0.03152763312490635, "kl": 0.141845703125, "learning_rate": 7.827929771588408e-06, "loss": 0.0057, "num_tokens": 1822602941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6131262268498763, "frac_reward_zero_std": 1.0, "grad_norm": 0.07336397450585581, "kl": 0.162353515625, "learning_rate": 7.822114513800546e-06, "loss": 0.0065, "num_tokens": 1823172285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6132969190065717, "frac_reward_zero_std": 1.0, "grad_norm": 0.025273119200644307, "kl": 0.154052734375, "learning_rate": 7.81630002908342e-06, "loss": 0.0062, "num_tokens": 1823739549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6134676111632671, "frac_reward_zero_std": 1.0, "grad_norm": 0.028487088879489666, "kl": 0.203125, "learning_rate": 7.810486319500954e-06, "loss": 0.0081, "num_tokens": 1824303709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6136383033199625, "frac_reward_zero_std": 1.0, "grad_norm": 0.04740877350354846, "kl": 0.17333984375, "learning_rate": 7.804673387116808e-06, "loss": 0.0069, "num_tokens": 1824880109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6138089954766578, "frac_reward_zero_std": 1.0, "grad_norm": 0.08697304071765373, "kl": 0.242431640625, "learning_rate": 7.79886123399436e-06, "loss": 0.0097, "num_tokens": 1825448749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6139796876333532, "frac_reward_zero_std": 1.0, "grad_norm": 0.1625210483478368, "kl": 0.29931640625, "learning_rate": 7.793049862196721e-06, "loss": 0.012, "num_tokens": 1826011037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6141503797900486, "frac_reward_zero_std": 0.9375, "grad_norm": 0.13916861904710484, "kl": 0.26708984375, "learning_rate": 7.787239273786716e-06, "loss": 0.0107, "num_tokens": 1826579709.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 3598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.614321071946744, "frac_reward_zero_std": 1.0, "grad_norm": 0.2097364034342297, "kl": 0.304443359375, "learning_rate": 7.781429470826888e-06, "loss": 0.0122, "num_tokens": 1827149277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6144917641034394, "frac_reward_zero_std": 1.0, "grad_norm": 0.055331255605435364, "kl": 0.32373046875, "learning_rate": 7.775620455379514e-06, "loss": 0.013, "num_tokens": 1827714829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6146624562601348, "frac_reward_zero_std": 1.0, "grad_norm": 0.15290155091780613, "kl": 0.39794921875, "learning_rate": 7.769812229506587e-06, "loss": 0.0159, "num_tokens": 1828276253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6148331484168302, "frac_reward_zero_std": 1.0, "grad_norm": 0.12196009879618043, "kl": 0.3896484375, "learning_rate": 7.764004795269808e-06, "loss": 0.0156, "num_tokens": 1828838285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6150038405735256, "frac_reward_zero_std": 1.0, "grad_norm": 0.038315916185642715, "kl": 0.3779296875, "learning_rate": 7.758198154730611e-06, "loss": 0.0151, "num_tokens": 1829404765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.615174532730221, "frac_reward_zero_std": 1.0, "grad_norm": 0.04116726341668229, "kl": 0.375, "learning_rate": 7.752392309950147e-06, "loss": 0.015, "num_tokens": 1829966253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6153452248869165, "frac_reward_zero_std": 0.9375, "grad_norm": 0.2224774727326447, "kl": 0.39453125, "learning_rate": 7.746587262989282e-06, "loss": 0.0158, "num_tokens": 1830528941.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 3605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6155159170436119, "frac_reward_zero_std": 1.0, "grad_norm": 0.20340418081307332, "kl": 0.37939453125, "learning_rate": 7.740783015908592e-06, "loss": 0.0152, "num_tokens": 1831091005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6156866092003073, "frac_reward_zero_std": 1.0, "grad_norm": 0.1846943944857075, "kl": 0.3994140625, "learning_rate": 7.734979570768376e-06, "loss": 0.016, "num_tokens": 1831659085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6158573013570027, "frac_reward_zero_std": 0.9375, "grad_norm": 0.23192584905466632, "kl": 0.357421875, "learning_rate": 7.729176929628653e-06, "loss": 0.0143, "num_tokens": 1832227901.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 3608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6160279935136981, "frac_reward_zero_std": 1.0, "grad_norm": 0.04365852046587561, "kl": 0.3359375, "learning_rate": 7.723375094549151e-06, "loss": 0.0134, "num_tokens": 1832791549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6161986856703935, "frac_reward_zero_std": 1.0, "grad_norm": 0.034497825674213334, "kl": 0.330078125, "learning_rate": 7.717574067589302e-06, "loss": 0.0132, "num_tokens": 1833355517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6163693778270889, "frac_reward_zero_std": 0.875, "grad_norm": 0.12022449835867591, "kl": 0.31689453125, "learning_rate": 7.711773850808272e-06, "loss": 0.0127, "num_tokens": 1833923693.0, "reward": 0.001953125, "reward_std": 0.0078125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.001953125, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 3611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6165400699837843, "frac_reward_zero_std": 1.0, "grad_norm": 0.1518432995371963, "kl": 0.262939453125, "learning_rate": 7.705974446264925e-06, "loss": 0.0105, "num_tokens": 1834491229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6167107621404796, "frac_reward_zero_std": 1.0, "grad_norm": 0.132643414615078, "kl": 0.253662109375, "learning_rate": 7.70017585601784e-06, "loss": 0.0102, "num_tokens": 1835076477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.616881454297175, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0420178341993271, "kl": 0.230712890625, "learning_rate": 7.694378082125304e-06, "loss": 0.0092, "num_tokens": 1835640749.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 3614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6170521464538704, "frac_reward_zero_std": 1.0, "grad_norm": 0.10415728372031023, "kl": 0.25, "learning_rate": 7.688581126645325e-06, "loss": 0.01, "num_tokens": 1836209373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6172228386105658, "frac_reward_zero_std": 1.0, "grad_norm": 0.028999723479354006, "kl": 0.234130859375, "learning_rate": 7.682784991635603e-06, "loss": 0.0094, "num_tokens": 1836777773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6173935307672612, "frac_reward_zero_std": 1.0, "grad_norm": 0.1362801870102913, "kl": 0.279296875, "learning_rate": 7.676989679153571e-06, "loss": 0.0112, "num_tokens": 1837342781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6175642229239566, "frac_reward_zero_std": 1.0, "grad_norm": 0.05113202867220429, "kl": 0.27783203125, "learning_rate": 7.67119519125634e-06, "loss": 0.0111, "num_tokens": 1837904349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.617734915080652, "frac_reward_zero_std": 1.0, "grad_norm": 0.05539872293864689, "kl": 0.29931640625, "learning_rate": 7.665401530000754e-06, "loss": 0.012, "num_tokens": 1838473997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6179056072373474, "frac_reward_zero_std": 1.0, "grad_norm": 0.045346351916284965, "kl": 0.3466796875, "learning_rate": 7.659608697443348e-06, "loss": 0.0139, "num_tokens": 1839036397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6180762993940428, "frac_reward_zero_std": 1.0, "grad_norm": 0.1581269466952803, "kl": 0.39111328125, "learning_rate": 7.653816695640375e-06, "loss": 0.0157, "num_tokens": 1839601117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6182469915507383, "frac_reward_zero_std": 1.0, "grad_norm": 0.03762201539343697, "kl": 0.3642578125, "learning_rate": 7.64802552664778e-06, "loss": 0.0146, "num_tokens": 1840167565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6184176837074337, "frac_reward_zero_std": 1.0, "grad_norm": 0.0683703706605684, "kl": 0.38525390625, "learning_rate": 7.642235192521224e-06, "loss": 0.0154, "num_tokens": 1840729181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6185883758641291, "frac_reward_zero_std": 1.0, "grad_norm": 0.04444217905251959, "kl": 0.4033203125, "learning_rate": 7.63644569531606e-06, "loss": 0.0161, "num_tokens": 1841293069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6187590680208245, "frac_reward_zero_std": 1.0, "grad_norm": 0.062363292872309774, "kl": 0.40771484375, "learning_rate": 7.630657037087358e-06, "loss": 0.0163, "num_tokens": 1841855981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6189297601775199, "frac_reward_zero_std": 0.9375, "grad_norm": 0.15689855063672156, "kl": 0.3671875, "learning_rate": 7.62486921988988e-06, "loss": 0.0147, "num_tokens": 1842419965.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 3626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6191004523342153, "frac_reward_zero_std": 1.0, "grad_norm": 0.10907511583478367, "kl": 0.376953125, "learning_rate": 7.619082245778089e-06, "loss": 0.0151, "num_tokens": 1842985629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6192711444909107, "frac_reward_zero_std": 1.0, "grad_norm": 0.062136429143608195, "kl": 0.34326171875, "learning_rate": 7.6132961168061555e-06, "loss": 0.0137, "num_tokens": 1843550685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.619441836647606, "frac_reward_zero_std": 0.9375, "grad_norm": 0.04641588314750596, "kl": 0.33642578125, "learning_rate": 7.607510835027948e-06, "loss": 0.0134, "num_tokens": 1844116829.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 3629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6196125288043014, "frac_reward_zero_std": 1.0, "grad_norm": 0.034860148477316284, "kl": 0.33056640625, "learning_rate": 7.601726402497028e-06, "loss": 0.0132, "num_tokens": 1844683917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6197832209609968, "frac_reward_zero_std": 1.0, "grad_norm": 0.03656034916836168, "kl": 0.32177734375, "learning_rate": 7.595942821266661e-06, "loss": 0.0129, "num_tokens": 1845247469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6199539131176922, "frac_reward_zero_std": 1.0, "grad_norm": 0.05429703572459611, "kl": 0.3349609375, "learning_rate": 7.590160093389812e-06, "loss": 0.0134, "num_tokens": 1845812525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6201246052743876, "frac_reward_zero_std": 1.0, "grad_norm": 0.0741005457367062, "kl": 0.318359375, "learning_rate": 7.584378220919143e-06, "loss": 0.0127, "num_tokens": 1846391997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.620295297431083, "frac_reward_zero_std": 1.0, "grad_norm": 0.1047491950365762, "kl": 0.32421875, "learning_rate": 7.578597205907003e-06, "loss": 0.013, "num_tokens": 1846961293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6204659895877784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0329357955104672, "kl": 0.3173828125, "learning_rate": 7.572817050405446e-06, "loss": 0.0127, "num_tokens": 1847530989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6206366817444738, "frac_reward_zero_std": 1.0, "grad_norm": 0.09476493175363082, "kl": 0.32421875, "learning_rate": 7.567037756466222e-06, "loss": 0.013, "num_tokens": 1848099821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6208073739011692, "frac_reward_zero_std": 1.0, "grad_norm": 0.04758877640831901, "kl": 0.3369140625, "learning_rate": 7.561259326140773e-06, "loss": 0.0135, "num_tokens": 1848668189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6209780660578647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0330664702513441, "kl": 0.3203125, "learning_rate": 7.555481761480224e-06, "loss": 0.0128, "num_tokens": 1849235789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6211487582145601, "frac_reward_zero_std": 1.0, "grad_norm": 0.02831916444108976, "kl": 0.3447265625, "learning_rate": 7.54970506453541e-06, "loss": 0.0138, "num_tokens": 1849798301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6213194503712555, "frac_reward_zero_std": 1.0, "grad_norm": 0.029527189335095773, "kl": 0.3583984375, "learning_rate": 7.5439292373568465e-06, "loss": 0.0143, "num_tokens": 1850366221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6214901425279509, "frac_reward_zero_std": 1.0, "grad_norm": 0.04470794187582201, "kl": 0.3876953125, "learning_rate": 7.5381542819947485e-06, "loss": 0.0155, "num_tokens": 1850938493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6216608346846463, "frac_reward_zero_std": 1.0, "grad_norm": 0.04907770652752252, "kl": 0.3818359375, "learning_rate": 7.532380200499008e-06, "loss": 0.0153, "num_tokens": 1851506285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6218315268413417, "frac_reward_zero_std": 1.0, "grad_norm": 0.054964868156567696, "kl": 0.43017578125, "learning_rate": 7.526606994919222e-06, "loss": 0.0172, "num_tokens": 1852075037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6220022189980371, "frac_reward_zero_std": 1.0, "grad_norm": 0.19137638581515407, "kl": 0.4326171875, "learning_rate": 7.520834667304668e-06, "loss": 0.0173, "num_tokens": 1852640925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6221729111547324, "frac_reward_zero_std": 1.0, "grad_norm": 0.046206354601448285, "kl": 0.40185546875, "learning_rate": 7.515063219704318e-06, "loss": 0.0161, "num_tokens": 1853206093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6223436033114278, "frac_reward_zero_std": 1.0, "grad_norm": 0.07042754484248794, "kl": 0.41748046875, "learning_rate": 7.509292654166819e-06, "loss": 0.0167, "num_tokens": 1853774605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6225142954681232, "frac_reward_zero_std": 1.0, "grad_norm": 0.04624209567364347, "kl": 0.4267578125, "learning_rate": 7.50352297274052e-06, "loss": 0.0171, "num_tokens": 1854340637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6226849876248186, "frac_reward_zero_std": 1.0, "grad_norm": 0.07226865704377775, "kl": 0.39892578125, "learning_rate": 7.497754177473446e-06, "loss": 0.0159, "num_tokens": 1854903565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.622855679781514, "frac_reward_zero_std": 1.0, "grad_norm": 0.051710571559691894, "kl": 0.4384765625, "learning_rate": 7.491986270413321e-06, "loss": 0.0175, "num_tokens": 1855467501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6230263719382094, "frac_reward_zero_std": 1.0, "grad_norm": 0.06029110903781817, "kl": 0.404296875, "learning_rate": 7.4862192536075275e-06, "loss": 0.0162, "num_tokens": 1856028621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6231970640949048, "frac_reward_zero_std": 1.0, "grad_norm": 0.05624493596606582, "kl": 0.3935546875, "learning_rate": 7.4804531291031605e-06, "loss": 0.0157, "num_tokens": 1856595613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6233677562516002, "frac_reward_zero_std": 1.0, "grad_norm": 0.07780077071041168, "kl": 0.41650390625, "learning_rate": 7.474687898946979e-06, "loss": 0.0167, "num_tokens": 1857159277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6235384484082956, "frac_reward_zero_std": 1.0, "grad_norm": 0.10573585474711139, "kl": 0.47509765625, "learning_rate": 7.46892356518544e-06, "loss": 0.019, "num_tokens": 1857720797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.623709140564991, "frac_reward_zero_std": 1.0, "grad_norm": 0.13112810098853772, "kl": 0.44970703125, "learning_rate": 7.463160129864666e-06, "loss": 0.018, "num_tokens": 1858291997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6238798327216865, "frac_reward_zero_std": 1.0, "grad_norm": 0.17571108211591363, "kl": 0.40673828125, "learning_rate": 7.457397595030471e-06, "loss": 0.0163, "num_tokens": 1858857245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6240505248783819, "frac_reward_zero_std": 1.0, "grad_norm": 0.16202260884616562, "kl": 0.30419921875, "learning_rate": 7.451635962728346e-06, "loss": 0.0122, "num_tokens": 1859427789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6242212170350773, "frac_reward_zero_std": 1.0, "grad_norm": 0.7309284564605232, "kl": 0.332763671875, "learning_rate": 7.445875235003465e-06, "loss": 0.0133, "num_tokens": 1859989133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6243919091917727, "frac_reward_zero_std": 1.0, "grad_norm": 0.17339063515358413, "kl": 0.2412109375, "learning_rate": 7.440115413900678e-06, "loss": 0.0096, "num_tokens": 1860557613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6245626013484681, "frac_reward_zero_std": 1.0, "grad_norm": 0.15739520116270486, "kl": 0.2568359375, "learning_rate": 7.43435650146451e-06, "loss": 0.0103, "num_tokens": 1861126445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6247332935051635, "frac_reward_zero_std": 1.0, "grad_norm": 0.18425141499283298, "kl": 0.2626953125, "learning_rate": 7.42859849973917e-06, "loss": 0.0105, "num_tokens": 1861688557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6249039856618588, "frac_reward_zero_std": 1.0, "grad_norm": 0.18343698191677607, "kl": 0.226806640625, "learning_rate": 7.422841410768544e-06, "loss": 0.0091, "num_tokens": 1862260221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6250746778185542, "frac_reward_zero_std": 1.0, "grad_norm": 0.09637430348885215, "kl": 0.193115234375, "learning_rate": 7.417085236596184e-06, "loss": 0.0077, "num_tokens": 1862825901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6252453699752496, "frac_reward_zero_std": 1.0, "grad_norm": 0.06664238672637715, "kl": 0.123046875, "learning_rate": 7.4113299792653255e-06, "loss": 0.0049, "num_tokens": 1863394861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.625416062131945, "frac_reward_zero_std": 1.0, "grad_norm": 0.027165682209197847, "kl": 0.0855712890625, "learning_rate": 7.405575640818881e-06, "loss": 0.0034, "num_tokens": 1863963085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6255867542886404, "frac_reward_zero_std": 1.0, "grad_norm": 0.1629103233623661, "kl": 0.0997314453125, "learning_rate": 7.3998222232994335e-06, "loss": 0.004, "num_tokens": 1864530205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6257574464453358, "frac_reward_zero_std": 1.0, "grad_norm": 0.04132207076212743, "kl": 0.08837890625, "learning_rate": 7.394069728749232e-06, "loss": 0.0035, "num_tokens": 1865096845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6259281386020312, "frac_reward_zero_std": 1.0, "grad_norm": 0.042595357799526466, "kl": 0.0948486328125, "learning_rate": 7.388318159210207e-06, "loss": 0.0038, "num_tokens": 1865661949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6260988307587266, "frac_reward_zero_std": 1.0, "grad_norm": 0.053665085173470864, "kl": 0.1258544921875, "learning_rate": 7.382567516723962e-06, "loss": 0.005, "num_tokens": 1866228861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.626269522915422, "frac_reward_zero_std": 1.0, "grad_norm": 0.06447771555565596, "kl": 0.184814453125, "learning_rate": 7.376817803331765e-06, "loss": 0.0074, "num_tokens": 1866793213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6264402150721174, "frac_reward_zero_std": 1.0, "grad_norm": 0.08007983739489181, "kl": 0.19775390625, "learning_rate": 7.371069021074553e-06, "loss": 0.0079, "num_tokens": 1867360493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6266109072288129, "frac_reward_zero_std": 1.0, "grad_norm": 0.07131947075557436, "kl": 0.16796875, "learning_rate": 7.3653211719929406e-06, "loss": 0.0067, "num_tokens": 1867935741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6267815993855083, "frac_reward_zero_std": 1.0, "grad_norm": 0.09126919558894152, "kl": 0.1312255859375, "learning_rate": 7.359574258127208e-06, "loss": 0.0052, "num_tokens": 1868500333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6269522915422037, "frac_reward_zero_std": 1.0, "grad_norm": 0.03711674806876098, "kl": 0.1116943359375, "learning_rate": 7.3538282815173e-06, "loss": 0.0045, "num_tokens": 1869063949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6271229836988991, "frac_reward_zero_std": 1.0, "grad_norm": 0.04982078247099575, "kl": 0.1260986328125, "learning_rate": 7.348083244202829e-06, "loss": 0.005, "num_tokens": 1869627149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6272936758555945, "frac_reward_zero_std": 1.0, "grad_norm": 0.10741323532167776, "kl": 0.194580078125, "learning_rate": 7.342339148223076e-06, "loss": 0.0078, "num_tokens": 1870186861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6274643680122899, "frac_reward_zero_std": 1.0, "grad_norm": 0.04117944081660176, "kl": 0.25634765625, "learning_rate": 7.33659599561699e-06, "loss": 0.0102, "num_tokens": 1870756285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6276350601689852, "frac_reward_zero_std": 1.0, "grad_norm": 0.053349653303561666, "kl": 0.27734375, "learning_rate": 7.330853788423187e-06, "loss": 0.0111, "num_tokens": 1871321469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6278057523256806, "frac_reward_zero_std": 1.0, "grad_norm": 0.05275348489045435, "kl": 0.2802734375, "learning_rate": 7.325112528679933e-06, "loss": 0.0112, "num_tokens": 1871887085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.627976444482376, "frac_reward_zero_std": 1.0, "grad_norm": 0.1224160676726237, "kl": 0.28662109375, "learning_rate": 7.319372218425176e-06, "loss": 0.0115, "num_tokens": 1872446589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6281471366390714, "frac_reward_zero_std": 1.0, "grad_norm": 0.09788436643702554, "kl": 0.37744140625, "learning_rate": 7.313632859696513e-06, "loss": 0.0151, "num_tokens": 1873010765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6283178287957668, "frac_reward_zero_std": 1.0, "grad_norm": 0.4587833009204419, "kl": 0.4775390625, "learning_rate": 7.307894454531217e-06, "loss": 0.0191, "num_tokens": 1873580989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6284885209524622, "frac_reward_zero_std": 1.0, "grad_norm": 0.9453076855841438, "kl": 0.4814453125, "learning_rate": 7.302157004966208e-06, "loss": 0.0193, "num_tokens": 1874146077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6286592131091576, "frac_reward_zero_std": 1.0, "grad_norm": 0.006748736428871016, "kl": 0.027618408203125, "learning_rate": 7.296420513038077e-06, "loss": 0.0011, "num_tokens": 1874715389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.628829905265853, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010603375774620298, "kl": 0.01715087890625, "learning_rate": 7.290684980783067e-06, "loss": 0.0007, "num_tokens": 1875277133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6290005974225484, "frac_reward_zero_std": 1.0, "grad_norm": 2.1841846109640562e-10, "kl": 0.0166015625, "learning_rate": 7.284950410237093e-06, "loss": 0.0007, "num_tokens": 1875836877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6291712895792438, "frac_reward_zero_std": 1.0, "grad_norm": 3.353213525161813e-12, "kl": 0.0169677734375, "learning_rate": 7.2792168034357135e-06, "loss": 0.0007, "num_tokens": 1876399069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6293419817359392, "frac_reward_zero_std": 1.0, "grad_norm": 7.248391551936577e-14, "kl": 0.017120361328125, "learning_rate": 7.2734841624141515e-06, "loss": 0.0007, "num_tokens": 1876961277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6295126738926347, "frac_reward_zero_std": 1.0, "grad_norm": 1.840265142939916e-15, "kl": 0.0167236328125, "learning_rate": 7.2677524892072936e-06, "loss": 0.0007, "num_tokens": 1877529261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6296833660493301, "frac_reward_zero_std": 1.0, "grad_norm": 6.166261567079576e-17, "kl": 0.017181396484375, "learning_rate": 7.262021785849673e-06, "loss": 0.0007, "num_tokens": 1878092605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6298540582060255, "frac_reward_zero_std": 1.0, "grad_norm": 3.729073310415294e-18, "kl": 0.016876220703125, "learning_rate": 7.256292054375484e-06, "loss": 0.0007, "num_tokens": 1878655389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6300247503627209, "frac_reward_zero_std": 1.0, "grad_norm": 4.532499953617755e-19, "kl": 0.017364501953125, "learning_rate": 7.2505632968185705e-06, "loss": 0.0007, "num_tokens": 1879220957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6301954425194163, "frac_reward_zero_std": 1.0, "grad_norm": 8.649396848563948e-20, "kl": 0.01690673828125, "learning_rate": 7.244835515212438e-06, "loss": 0.0007, "num_tokens": 1879790557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6303661346761116, "frac_reward_zero_std": 1.0, "grad_norm": 2.2323106849931384e-20, "kl": 0.016998291015625, "learning_rate": 7.239108711590246e-06, "loss": 0.0007, "num_tokens": 1880355229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.630536826832807, "frac_reward_zero_std": 1.0, "grad_norm": 8.06516208532409e-21, "kl": 0.016693115234375, "learning_rate": 7.2333828879847964e-06, "loss": 0.0007, "num_tokens": 1880916653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6307075189895024, "frac_reward_zero_std": 1.0, "grad_norm": 4.1636040462829615e-21, "kl": 0.0166015625, "learning_rate": 7.22765804642855e-06, "loss": 0.0007, "num_tokens": 1881479965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6308782111461978, "frac_reward_zero_std": 1.0, "grad_norm": 2.356175123471552e-21, "kl": 0.016845703125, "learning_rate": 7.221934188953623e-06, "loss": 0.0007, "num_tokens": 1882044301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6310489033028932, "frac_reward_zero_std": 1.0, "grad_norm": 1.683827196615927e-21, "kl": 0.017425537109375, "learning_rate": 7.21621131759178e-06, "loss": 0.0007, "num_tokens": 1882609789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6312195954595886, "frac_reward_zero_std": 1.0, "grad_norm": 1.3715597315050698e-21, "kl": 0.01678466796875, "learning_rate": 7.210489434374428e-06, "loss": 0.0007, "num_tokens": 1883177613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.631390287616284, "frac_reward_zero_std": 1.0, "grad_norm": 1.1289093095111681e-21, "kl": 0.01666259765625, "learning_rate": 7.2047685413326275e-06, "loss": 0.0007, "num_tokens": 1883751261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6315609797729794, "frac_reward_zero_std": 1.0, "grad_norm": 1.1459128712797313e-21, "kl": 0.017181396484375, "learning_rate": 7.199048640497097e-06, "loss": 0.0007, "num_tokens": 1884328045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6317316719296748, "frac_reward_zero_std": 1.0, "grad_norm": 9.292794486366438e-22, "kl": 0.017181396484375, "learning_rate": 7.193329733898191e-06, "loss": 0.0007, "num_tokens": 1884905885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6319023640863702, "frac_reward_zero_std": 1.0, "grad_norm": 1.0617895617932545e-21, "kl": 0.0172119140625, "learning_rate": 7.187611823565911e-06, "loss": 0.0007, "num_tokens": 1885468973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6320730562430656, "frac_reward_zero_std": 1.0, "grad_norm": 1.1993809606592362e-21, "kl": 0.016998291015625, "learning_rate": 7.1818949115299145e-06, "loss": 0.0007, "num_tokens": 1886033277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.632243748399761, "frac_reward_zero_std": 1.0, "grad_norm": 9.786810323156023e-22, "kl": 0.016754150390625, "learning_rate": 7.1761789998194965e-06, "loss": 0.0007, "num_tokens": 1886598941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6324144405564565, "frac_reward_zero_std": 1.0, "grad_norm": 8.678090162790934e-22, "kl": 0.0169677734375, "learning_rate": 7.1704640904636e-06, "loss": 0.0007, "num_tokens": 1887179069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6325851327131519, "frac_reward_zero_std": 1.0, "grad_norm": 1.0051719047741036e-21, "kl": 0.0172119140625, "learning_rate": 7.1647501854908095e-06, "loss": 0.0007, "num_tokens": 1887743165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6327558248698473, "frac_reward_zero_std": 1.0, "grad_norm": 8.345112933874138e-22, "kl": 0.016693115234375, "learning_rate": 7.1590372869293564e-06, "loss": 0.0007, "num_tokens": 1888306957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6329265170265427, "frac_reward_zero_std": 1.0, "grad_norm": 9.1820573592185e-22, "kl": 0.017120361328125, "learning_rate": 7.153325396807112e-06, "loss": 0.0007, "num_tokens": 1888871757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6330972091832381, "frac_reward_zero_std": 1.0, "grad_norm": 1.0273875721092744e-21, "kl": 0.016693115234375, "learning_rate": 7.1476145171515994e-06, "loss": 0.0007, "num_tokens": 1889436557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6332679013399334, "frac_reward_zero_std": 1.0, "grad_norm": 9.35470315344771e-22, "kl": 0.017120361328125, "learning_rate": 7.1419046499899626e-06, "loss": 0.0007, "num_tokens": 1890003181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6334385934966288, "frac_reward_zero_std": 1.0, "grad_norm": 1.005961308188686e-21, "kl": 0.016754150390625, "learning_rate": 7.136195797349005e-06, "loss": 0.0007, "num_tokens": 1890564877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6336092856533242, "frac_reward_zero_std": 1.0, "grad_norm": 1.0310186760651327e-21, "kl": 0.01678466796875, "learning_rate": 7.130487961255159e-06, "loss": 0.0007, "num_tokens": 1891132029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6337799778100196, "frac_reward_zero_std": 1.0, "grad_norm": 1.1325348952400624e-21, "kl": 0.016937255859375, "learning_rate": 7.1247811437345095e-06, "loss": 0.0007, "num_tokens": 1891701725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.633950669966715, "frac_reward_zero_std": 1.0, "grad_norm": 9.669016580095227e-22, "kl": 0.017120361328125, "learning_rate": 7.119075346812761e-06, "loss": 0.0007, "num_tokens": 1892267261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6341213621234104, "frac_reward_zero_std": 1.0, "grad_norm": 150.10904411547784, "kl": 4.234375, "learning_rate": 7.113370572515269e-06, "loss": 0.1699, "num_tokens": 1892857309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6342920542801058, "frac_reward_zero_std": 1.0, "grad_norm": 5.18335109398466e-23, "kl": 0.0172119140625, "learning_rate": 7.107666822867021e-06, "loss": 0.0007, "num_tokens": 1893422205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6344627464368012, "frac_reward_zero_std": 1.0, "grad_norm": 7.454573829930535e-23, "kl": 0.016876220703125, "learning_rate": 7.101964099892647e-06, "loss": 0.0007, "num_tokens": 1893987709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6346334385934966, "frac_reward_zero_std": 1.0, "grad_norm": 1.8611498085431604e-22, "kl": 0.016571044921875, "learning_rate": 7.096262405616403e-06, "loss": 0.0007, "num_tokens": 1894551773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.634804130750192, "frac_reward_zero_std": 1.0, "grad_norm": 4.1025760394452833e-22, "kl": 0.0172119140625, "learning_rate": 7.090561742062183e-06, "loss": 0.0007, "num_tokens": 1895120925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6349748229068874, "frac_reward_zero_std": 1.0, "grad_norm": 6.979320505143568e-22, "kl": 0.017120361328125, "learning_rate": 7.084862111253523e-06, "loss": 0.0007, "num_tokens": 1895687357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6351455150635829, "frac_reward_zero_std": 1.0, "grad_norm": 1.0942054886299408e-21, "kl": 0.017120361328125, "learning_rate": 7.079163515213585e-06, "loss": 0.0007, "num_tokens": 1896260733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6353162072202783, "frac_reward_zero_std": 1.0, "grad_norm": 1.3617598583960941e-21, "kl": 0.017242431640625, "learning_rate": 7.0734659559651606e-06, "loss": 0.0007, "num_tokens": 1896829421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6354868993769737, "frac_reward_zero_std": 1.0, "grad_norm": 2.0193153899121654e-21, "kl": 0.0172119140625, "learning_rate": 7.067769435530678e-06, "loss": 0.0007, "num_tokens": 1897395725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6356575915336691, "frac_reward_zero_std": 1.0, "grad_norm": 1.950135771066212e-21, "kl": 0.01678466796875, "learning_rate": 7.062073955932201e-06, "loss": 0.0007, "num_tokens": 1897958749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6358282836903645, "frac_reward_zero_std": 1.0, "grad_norm": 2.400041091661987e-21, "kl": 0.016754150390625, "learning_rate": 7.056379519191418e-06, "loss": 0.0007, "num_tokens": 1898527213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6359989758470598, "frac_reward_zero_std": 1.0, "grad_norm": 2.5518828286192175e-21, "kl": 0.0167236328125, "learning_rate": 7.050686127329644e-06, "loss": 0.0007, "num_tokens": 1899089949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6361696680037552, "frac_reward_zero_std": 1.0, "grad_norm": 2.7754000873684743e-21, "kl": 0.01690673828125, "learning_rate": 7.044993782367831e-06, "loss": 0.0007, "num_tokens": 1899657069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6363403601604506, "frac_reward_zero_std": 1.0, "grad_norm": 3.2948599176979203e-21, "kl": 0.0169677734375, "learning_rate": 7.039302486326556e-06, "loss": 0.0007, "num_tokens": 1900221469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.636511052317146, "frac_reward_zero_std": 1.0, "grad_norm": 3.528156243209661e-21, "kl": 0.01727294921875, "learning_rate": 7.033612241226026e-06, "loss": 0.0007, "num_tokens": 1900799773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6366817444738414, "frac_reward_zero_std": 1.0, "grad_norm": 3.449413876627948e-21, "kl": 0.01751708984375, "learning_rate": 7.027923049086066e-06, "loss": 0.0007, "num_tokens": 1901367053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6368524366305368, "frac_reward_zero_std": 1.0, "grad_norm": 3.5576494507975515e-21, "kl": 0.017120361328125, "learning_rate": 7.02223491192614e-06, "loss": 0.0007, "num_tokens": 1901932221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6370231287872322, "frac_reward_zero_std": 1.0, "grad_norm": 3.721714895463822e-21, "kl": 0.017364501953125, "learning_rate": 7.016547831765328e-06, "loss": 0.0007, "num_tokens": 1902495965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6371938209439276, "frac_reward_zero_std": 1.0, "grad_norm": 4.6389481888862045e-21, "kl": 0.017486572265625, "learning_rate": 7.0108618106223425e-06, "loss": 0.0007, "num_tokens": 1903061117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.637364513100623, "frac_reward_zero_std": 1.0, "grad_norm": 4.44047069190491e-21, "kl": 0.01751708984375, "learning_rate": 7.005176850515507e-06, "loss": 0.0007, "num_tokens": 1903626189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6375352052573184, "frac_reward_zero_std": 1.0, "grad_norm": 4.167811383289676e-21, "kl": 0.0172119140625, "learning_rate": 6.999492953462786e-06, "loss": 0.0007, "num_tokens": 1904188157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6377058974140138, "frac_reward_zero_std": 1.0, "grad_norm": 3.5941252170428295e-21, "kl": 0.017486572265625, "learning_rate": 6.993810121481752e-06, "loss": 0.0007, "num_tokens": 1904754909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6378765895707093, "frac_reward_zero_std": 1.0, "grad_norm": 4.707793290992946e-21, "kl": 0.017242431640625, "learning_rate": 6.988128356589614e-06, "loss": 0.0007, "num_tokens": 1905325693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6380472817274047, "frac_reward_zero_std": 1.0, "grad_norm": 4.488370308416614e-21, "kl": 0.01678466796875, "learning_rate": 6.98244766080318e-06, "loss": 0.0007, "num_tokens": 1905887181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6382179738841001, "frac_reward_zero_std": 1.0, "grad_norm": 4.594516910871476e-21, "kl": 0.01708984375, "learning_rate": 6.976768036138902e-06, "loss": 0.0007, "num_tokens": 1906460845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6383886660407955, "frac_reward_zero_std": 1.0, "grad_norm": 3.973660680756128e-21, "kl": 0.016998291015625, "learning_rate": 6.971089484612836e-06, "loss": 0.0007, "num_tokens": 1907036925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6385593581974909, "frac_reward_zero_std": 1.0, "grad_norm": 3.947777477071196e-21, "kl": 0.017120361328125, "learning_rate": 6.965412008240673e-06, "loss": 0.0007, "num_tokens": 1907604797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6387300503541862, "frac_reward_zero_std": 1.0, "grad_norm": 4.340302286720988e-21, "kl": 0.017303466796875, "learning_rate": 6.959735609037699e-06, "loss": 0.0007, "num_tokens": 1908172093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6389007425108816, "frac_reward_zero_std": 1.0, "grad_norm": 3.8445252476693504e-21, "kl": 0.01666259765625, "learning_rate": 6.954060289018836e-06, "loss": 0.0007, "num_tokens": 1908733885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.639071434667577, "frac_reward_zero_std": 1.0, "grad_norm": 4.874085581669861e-21, "kl": 0.017120361328125, "learning_rate": 6.9483860501986185e-06, "loss": 0.0007, "num_tokens": 1909300925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6392421268242724, "frac_reward_zero_std": 1.0, "grad_norm": 4.805317833687837e-21, "kl": 0.0167236328125, "learning_rate": 6.942712894591199e-06, "loss": 0.0007, "num_tokens": 1909862381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6394128189809678, "frac_reward_zero_std": 1.0, "grad_norm": 4.259917105877894e-21, "kl": 0.01708984375, "learning_rate": 6.937040824210339e-06, "loss": 0.0007, "num_tokens": 1910425309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6395835111376632, "frac_reward_zero_std": 1.0, "grad_norm": 5.4385141187375165e-21, "kl": 0.017364501953125, "learning_rate": 6.93136984106942e-06, "loss": 0.0007, "num_tokens": 1911013981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6397542032943586, "frac_reward_zero_std": 1.0, "grad_norm": 5.366448975212658e-21, "kl": 0.016876220703125, "learning_rate": 6.925699947181433e-06, "loss": 0.0007, "num_tokens": 1911578973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.639924895451054, "frac_reward_zero_std": 1.0, "grad_norm": 4.736980524649437e-21, "kl": 0.017181396484375, "learning_rate": 6.9200311445589944e-06, "loss": 0.0007, "num_tokens": 1912151853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6400955876077494, "frac_reward_zero_std": 1.0, "grad_norm": 4.722035261727249e-21, "kl": 0.017120361328125, "learning_rate": 6.914363435214316e-06, "loss": 0.0007, "num_tokens": 1912719181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6402662797644448, "frac_reward_zero_std": 1.0, "grad_norm": 4.645316481672532e-21, "kl": 0.0166015625, "learning_rate": 6.908696821159231e-06, "loss": 0.0007, "num_tokens": 1913283245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6404369719211402, "frac_reward_zero_std": 1.0, "grad_norm": 4.9887919342494596e-21, "kl": 0.016571044921875, "learning_rate": 6.9030313044051925e-06, "loss": 0.0007, "num_tokens": 1913843917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6406076640778356, "frac_reward_zero_std": 1.0, "grad_norm": 4.7254922035692155e-21, "kl": 0.016998291015625, "learning_rate": 6.89736688696324e-06, "loss": 0.0007, "num_tokens": 1914409309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.640778356234531, "frac_reward_zero_std": 1.0, "grad_norm": 5.3799404445369384e-21, "kl": 0.01678466796875, "learning_rate": 6.891703570844044e-06, "loss": 0.0007, "num_tokens": 1914974557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6409490483912265, "frac_reward_zero_std": 1.0, "grad_norm": 4.828117803974265e-21, "kl": 0.017669677734375, "learning_rate": 6.886041358057877e-06, "loss": 0.0007, "num_tokens": 1915539821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6411197405479219, "frac_reward_zero_std": 1.0, "grad_norm": 4.840082385661458e-21, "kl": 0.016845703125, "learning_rate": 6.880380250614624e-06, "loss": 0.0007, "num_tokens": 1916105325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6412904327046173, "frac_reward_zero_std": 1.0, "grad_norm": 4.775759714076835e-21, "kl": 0.017303466796875, "learning_rate": 6.874720250523769e-06, "loss": 0.0007, "num_tokens": 1916669165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6414611248613126, "frac_reward_zero_std": 1.0, "grad_norm": 4.744699163451872e-21, "kl": 0.0166015625, "learning_rate": 6.869061359794408e-06, "loss": 0.0007, "num_tokens": 1917227533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.641631817018008, "frac_reward_zero_std": 1.0, "grad_norm": 5.151407886019201e-21, "kl": 0.016845703125, "learning_rate": 6.863403580435242e-06, "loss": 0.0007, "num_tokens": 1917793181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6418025091747034, "frac_reward_zero_std": 1.0, "grad_norm": 4.9347596683691685e-21, "kl": 0.01715087890625, "learning_rate": 6.857746914454585e-06, "loss": 0.0007, "num_tokens": 1918367165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6419732013313988, "frac_reward_zero_std": 1.0, "grad_norm": 4.887164698468726e-21, "kl": 0.01702880859375, "learning_rate": 6.8520913638603425e-06, "loss": 0.0007, "num_tokens": 1918933245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6421438934880942, "frac_reward_zero_std": 1.0, "grad_norm": 4.548905639460651e-21, "kl": 0.01690673828125, "learning_rate": 6.846436930660029e-06, "loss": 0.0007, "num_tokens": 1919498269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6423145856447896, "frac_reward_zero_std": 1.0, "grad_norm": 4.432366651773886e-21, "kl": 0.0164794921875, "learning_rate": 6.840783616860772e-06, "loss": 0.0007, "num_tokens": 1920067533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.642485277801485, "frac_reward_zero_std": 1.0, "grad_norm": 4.267808227998772e-21, "kl": 0.016693115234375, "learning_rate": 6.835131424469292e-06, "loss": 0.0007, "num_tokens": 1920633501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6426559699581804, "frac_reward_zero_std": 1.0, "grad_norm": 4.19170410236526e-21, "kl": 0.016998291015625, "learning_rate": 6.829480355491909e-06, "loss": 0.0007, "num_tokens": 1921202861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6428266621148758, "frac_reward_zero_std": 1.0, "grad_norm": 4.964738815144513e-21, "kl": 0.01702880859375, "learning_rate": 6.823830411934547e-06, "loss": 0.0007, "num_tokens": 1921769101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6429973542715712, "frac_reward_zero_std": 1.0, "grad_norm": 4.130262044232339e-21, "kl": 0.017333984375, "learning_rate": 6.818181595802739e-06, "loss": 0.0007, "num_tokens": 1922333213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6431680464282666, "frac_reward_zero_std": 1.0, "grad_norm": 4.82921554730933e-21, "kl": 0.0169677734375, "learning_rate": 6.812533909101609e-06, "loss": 0.0007, "num_tokens": 1922901181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.643338738584962, "frac_reward_zero_std": 1.0, "grad_norm": 5.055280158811875e-21, "kl": 0.017059326171875, "learning_rate": 6.8068873538358785e-06, "loss": 0.0007, "num_tokens": 1923471741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6435094307416575, "frac_reward_zero_std": 1.0, "grad_norm": 5.15658956727301e-21, "kl": 0.01708984375, "learning_rate": 6.80124193200987e-06, "loss": 0.0007, "num_tokens": 1924037101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6436801228983529, "frac_reward_zero_std": 1.0, "grad_norm": 4.691280861473944e-21, "kl": 0.016754150390625, "learning_rate": 6.7955976456275096e-06, "loss": 0.0007, "num_tokens": 1924602893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6438508150550483, "frac_reward_zero_std": 1.0, "grad_norm": 4.697611452796789e-21, "kl": 0.01678466796875, "learning_rate": 6.789954496692316e-06, "loss": 0.0007, "num_tokens": 1925167581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6440215072117437, "frac_reward_zero_std": 1.0, "grad_norm": 4.801428813113937e-21, "kl": 0.01666259765625, "learning_rate": 6.784312487207394e-06, "loss": 0.0007, "num_tokens": 1925735101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.644192199368439, "frac_reward_zero_std": 1.0, "grad_norm": 5.2522337477478026e-21, "kl": 0.01690673828125, "learning_rate": 6.778671619175463e-06, "loss": 0.0007, "num_tokens": 1926305981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6443628915251344, "frac_reward_zero_std": 1.0, "grad_norm": 4.343864057375075e-21, "kl": 0.01666259765625, "learning_rate": 6.773031894598823e-06, "loss": 0.0007, "num_tokens": 1926871133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6445335836818298, "frac_reward_zero_std": 1.0, "grad_norm": 5.05624982031695e-21, "kl": 0.016815185546875, "learning_rate": 6.767393315479376e-06, "loss": 0.0007, "num_tokens": 1927434013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6447042758385252, "frac_reward_zero_std": 1.0, "grad_norm": 4.353048219228506e-21, "kl": 0.01727294921875, "learning_rate": 6.761755883818608e-06, "loss": 0.0007, "num_tokens": 1928001725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6448749679952206, "frac_reward_zero_std": 1.0, "grad_norm": 4.969941808617112e-21, "kl": 0.016998291015625, "learning_rate": 6.756119601617608e-06, "loss": 0.0007, "num_tokens": 1928565933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.645045660151916, "frac_reward_zero_std": 1.0, "grad_norm": 4.6547672801697065e-21, "kl": 0.01678466796875, "learning_rate": 6.7504844708770516e-06, "loss": 0.0007, "num_tokens": 1929128477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6452163523086114, "frac_reward_zero_std": 1.0, "grad_norm": 5.357721478189609e-21, "kl": 0.01690673828125, "learning_rate": 6.744850493597213e-06, "loss": 0.0007, "num_tokens": 1929690925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6453870444653068, "frac_reward_zero_std": 1.0, "grad_norm": 5.18275304776346e-21, "kl": 0.016632080078125, "learning_rate": 6.739217671777938e-06, "loss": 0.0007, "num_tokens": 1930257981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6455577366220022, "frac_reward_zero_std": 1.0, "grad_norm": 4.854690638636772e-21, "kl": 0.017120361328125, "learning_rate": 6.733586007418684e-06, "loss": 0.0007, "num_tokens": 1930822957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6457284287786976, "frac_reward_zero_std": 1.0, "grad_norm": 4.801821227032478e-21, "kl": 0.0174560546875, "learning_rate": 6.727955502518485e-06, "loss": 0.0007, "num_tokens": 1931387421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.645899120935393, "frac_reward_zero_std": 1.0, "grad_norm": 5.0003791134913105e-21, "kl": 0.0172119140625, "learning_rate": 6.7223261590759735e-06, "loss": 0.0007, "num_tokens": 1931955549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6460698130920884, "frac_reward_zero_std": 1.0, "grad_norm": 4.669534351281389e-21, "kl": 0.0166015625, "learning_rate": 6.716697979089355e-06, "loss": 0.0007, "num_tokens": 1932527549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6462405052487838, "frac_reward_zero_std": 1.0, "grad_norm": 4.7183613061658804e-21, "kl": 0.017059326171875, "learning_rate": 6.711070964556434e-06, "loss": 0.0007, "num_tokens": 1933093005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6464111974054793, "frac_reward_zero_std": 1.0, "grad_norm": 5.257690561222004e-21, "kl": 0.016845703125, "learning_rate": 6.7054451174745945e-06, "loss": 0.0007, "num_tokens": 1933656541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6465818895621747, "frac_reward_zero_std": 1.0, "grad_norm": 5.193553630030201e-21, "kl": 0.01715087890625, "learning_rate": 6.699820439840817e-06, "loss": 0.0007, "num_tokens": 1934220493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6467525817188701, "frac_reward_zero_std": 1.0, "grad_norm": 4.740561757569585e-21, "kl": 0.016571044921875, "learning_rate": 6.694196933651651e-06, "loss": 0.0007, "num_tokens": 1934788365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6469232738755654, "frac_reward_zero_std": 1.0, "grad_norm": 4.785613824826261e-21, "kl": 0.016387939453125, "learning_rate": 6.688574600903241e-06, "loss": 0.0007, "num_tokens": 1935355533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6470939660322608, "frac_reward_zero_std": 1.0, "grad_norm": 5.155291498946599e-21, "kl": 0.016876220703125, "learning_rate": 6.68295344359131e-06, "loss": 0.0007, "num_tokens": 1935932269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6472646581889562, "frac_reward_zero_std": 1.0, "grad_norm": 4.541368547229317e-21, "kl": 0.01715087890625, "learning_rate": 6.677333463711172e-06, "loss": 0.0007, "num_tokens": 1936504253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6474353503456516, "frac_reward_zero_std": 1.0, "grad_norm": 4.66667276266515e-21, "kl": 0.017486572265625, "learning_rate": 6.671714663257711e-06, "loss": 0.0007, "num_tokens": 1937068877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.647606042502347, "frac_reward_zero_std": 1.0, "grad_norm": 5.236226611966821e-21, "kl": 0.016876220703125, "learning_rate": 6.666097044225399e-06, "loss": 0.0007, "num_tokens": 1937636285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6477767346590424, "frac_reward_zero_std": 1.0, "grad_norm": 5.3492043494517414e-21, "kl": 0.016845703125, "learning_rate": 6.660480608608291e-06, "loss": 0.0007, "num_tokens": 1938200381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6479474268157378, "frac_reward_zero_std": 1.0, "grad_norm": 4.969346227679649e-21, "kl": 0.016815185546875, "learning_rate": 6.654865358400018e-06, "loss": 0.0007, "num_tokens": 1938765597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6481181189724332, "frac_reward_zero_std": 1.0, "grad_norm": 5.021938444553805e-21, "kl": 0.01715087890625, "learning_rate": 6.64925129559379e-06, "loss": 0.0007, "num_tokens": 1939331469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6482888111291286, "frac_reward_zero_std": 1.0, "grad_norm": 4.9245970267179624e-21, "kl": 0.016876220703125, "learning_rate": 6.643638422182393e-06, "loss": 0.0007, "num_tokens": 1939896493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.648459503285824, "frac_reward_zero_std": 1.0, "grad_norm": 4.867039682109683e-21, "kl": 0.017120361328125, "learning_rate": 6.638026740158202e-06, "loss": 0.0007, "num_tokens": 1940465405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6486301954425194, "frac_reward_zero_std": 1.0, "grad_norm": 5.156530609903513e-21, "kl": 0.01690673828125, "learning_rate": 6.6324162515131605e-06, "loss": 0.0007, "num_tokens": 1941031213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6488008875992148, "frac_reward_zero_std": 1.0, "grad_norm": 4.4472857270427116e-21, "kl": 0.016876220703125, "learning_rate": 6.626806958238784e-06, "loss": 0.0007, "num_tokens": 1941596269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6489715797559102, "frac_reward_zero_std": 1.0, "grad_norm": 4.91201973169159e-21, "kl": 0.01727294921875, "learning_rate": 6.62119886232617e-06, "loss": 0.0007, "num_tokens": 1942161277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6491422719126057, "frac_reward_zero_std": 1.0, "grad_norm": 4.468414083385939e-21, "kl": 0.016754150390625, "learning_rate": 6.615591965765994e-06, "loss": 0.0007, "num_tokens": 1942731645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6493129640693011, "frac_reward_zero_std": 1.0, "grad_norm": 4.357234981924432e-21, "kl": 0.017852783203125, "learning_rate": 6.609986270548503e-06, "loss": 0.0007, "num_tokens": 1943292893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6494836562259965, "frac_reward_zero_std": 1.0, "grad_norm": 4.425595537143081e-21, "kl": 0.016845703125, "learning_rate": 6.604381778663509e-06, "loss": 0.0007, "num_tokens": 1943857869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6496543483826919, "frac_reward_zero_std": 1.0, "grad_norm": 5.552731054144486e-21, "kl": 0.01708984375, "learning_rate": 6.59877849210041e-06, "loss": 0.0007, "num_tokens": 1944426685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6498250405393872, "frac_reward_zero_std": 1.0, "grad_norm": 4.903844287996609e-21, "kl": 0.01702880859375, "learning_rate": 6.59317641284817e-06, "loss": 0.0007, "num_tokens": 1944992621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6499957326960826, "frac_reward_zero_std": 1.0, "grad_norm": 4.523641642618465e-21, "kl": 0.016632080078125, "learning_rate": 6.5875755428953255e-06, "loss": 0.0007, "num_tokens": 1945560221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.650166424852778, "frac_reward_zero_std": 1.0, "grad_norm": 5.254829166157146e-21, "kl": 0.017364501953125, "learning_rate": 6.581975884229979e-06, "loss": 0.0007, "num_tokens": 1946129069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6503371170094734, "frac_reward_zero_std": 1.0, "grad_norm": 5.388518562858212e-21, "kl": 0.017486572265625, "learning_rate": 6.576377438839812e-06, "loss": 0.0007, "num_tokens": 1946692797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6505078091661688, "frac_reward_zero_std": 1.0, "grad_norm": 4.7373150131259546e-21, "kl": 0.0169677734375, "learning_rate": 6.570780208712064e-06, "loss": 0.0007, "num_tokens": 1947255085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6506785013228642, "frac_reward_zero_std": 1.0, "grad_norm": 5.394952353526542e-21, "kl": 0.016754150390625, "learning_rate": 6.565184195833562e-06, "loss": 0.0007, "num_tokens": 1947824797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6508491934795596, "frac_reward_zero_std": 1.0, "grad_norm": 5.189376514882033e-21, "kl": 0.01666259765625, "learning_rate": 6.559589402190676e-06, "loss": 0.0007, "num_tokens": 1948391389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.651019885636255, "frac_reward_zero_std": 1.0, "grad_norm": 5.146104673980752e-21, "kl": 0.01654052734375, "learning_rate": 6.553995829769362e-06, "loss": 0.0007, "num_tokens": 1948957309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6511905777929504, "frac_reward_zero_std": 1.0, "grad_norm": 4.7792170240207984e-21, "kl": 0.016937255859375, "learning_rate": 6.5484034805551325e-06, "loss": 0.0007, "num_tokens": 1949520093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6513612699496458, "frac_reward_zero_std": 1.0, "grad_norm": 4.58508408291089e-21, "kl": 0.0167236328125, "learning_rate": 6.542812356533075e-06, "loss": 0.0007, "num_tokens": 1950083533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6515319621063412, "frac_reward_zero_std": 1.0, "grad_norm": 4.531134702435678e-21, "kl": 0.016632080078125, "learning_rate": 6.537222459687832e-06, "loss": 0.0007, "num_tokens": 1950662045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6517026542630366, "frac_reward_zero_std": 1.0, "grad_norm": 5.158587834838777e-21, "kl": 0.017303466796875, "learning_rate": 6.531633792003618e-06, "loss": 0.0007, "num_tokens": 1951225197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.651873346419732, "frac_reward_zero_std": 1.0, "grad_norm": 4.7941628629261464e-21, "kl": 0.016998291015625, "learning_rate": 6.526046355464203e-06, "loss": 0.0007, "num_tokens": 1951789741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6520440385764275, "frac_reward_zero_std": 1.0, "grad_norm": 5.4119311491132094e-21, "kl": 0.0172119140625, "learning_rate": 6.5204601520529344e-06, "loss": 0.0007, "num_tokens": 1952358477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6522147307331229, "frac_reward_zero_std": 1.0, "grad_norm": 5.0917659069802845e-21, "kl": 0.0169677734375, "learning_rate": 6.514875183752704e-06, "loss": 0.0007, "num_tokens": 1952920925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6523854228898183, "frac_reward_zero_std": 1.0, "grad_norm": 4.8891108692483e-21, "kl": 0.01690673828125, "learning_rate": 6.509291452545978e-06, "loss": 0.0007, "num_tokens": 1953488637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6525561150465136, "frac_reward_zero_std": 1.0, "grad_norm": 4.483270312075193e-21, "kl": 0.0169677734375, "learning_rate": 6.503708960414781e-06, "loss": 0.0007, "num_tokens": 1954053645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.652726807203209, "frac_reward_zero_std": 1.0, "grad_norm": 4.589109988619779e-21, "kl": 0.01666259765625, "learning_rate": 6.4981277093406946e-06, "loss": 0.0007, "num_tokens": 1954615853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6528974993599044, "frac_reward_zero_std": 1.0, "grad_norm": 5.417897277251365e-21, "kl": 0.017059326171875, "learning_rate": 6.49254770130486e-06, "loss": 0.0007, "num_tokens": 1955181885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6530681915165998, "frac_reward_zero_std": 1.0, "grad_norm": 5.2044553575959e-21, "kl": 0.016571044921875, "learning_rate": 6.486968938287977e-06, "loss": 0.0007, "num_tokens": 1955745005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6532388836732952, "frac_reward_zero_std": 1.0, "grad_norm": 4.559294778072172e-21, "kl": 0.016632080078125, "learning_rate": 6.481391422270311e-06, "loss": 0.0007, "num_tokens": 1956308573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6534095758299906, "frac_reward_zero_std": 1.0, "grad_norm": 5.596501750957763e-21, "kl": 0.01739501953125, "learning_rate": 6.475815155231677e-06, "loss": 0.0007, "num_tokens": 1956874749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.653580267986686, "frac_reward_zero_std": 1.0, "grad_norm": 5.559810213757837e-21, "kl": 0.016998291015625, "learning_rate": 6.470240139151445e-06, "loss": 0.0007, "num_tokens": 1957442397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6537509601433814, "frac_reward_zero_std": 1.0, "grad_norm": 4.846412114826729e-21, "kl": 0.0174560546875, "learning_rate": 6.464666376008544e-06, "loss": 0.0007, "num_tokens": 1958004157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6539216523000768, "frac_reward_zero_std": 1.0, "grad_norm": 4.711385484081449e-21, "kl": 0.016754150390625, "learning_rate": 6.459093867781463e-06, "loss": 0.0007, "num_tokens": 1958573229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6540923444567722, "frac_reward_zero_std": 1.0, "grad_norm": 5.735814428691908e-21, "kl": 0.017364501953125, "learning_rate": 6.453522616448243e-06, "loss": 0.0007, "num_tokens": 1959137229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6542630366134676, "frac_reward_zero_std": 1.0, "grad_norm": 5.431139147763618e-21, "kl": 0.017425537109375, "learning_rate": 6.447952623986469e-06, "loss": 0.0007, "num_tokens": 1959701181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.654433728770163, "frac_reward_zero_std": 1.0, "grad_norm": 5.020342584333789e-21, "kl": 0.01708984375, "learning_rate": 6.442383892373294e-06, "loss": 0.0007, "num_tokens": 1960267901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6546044209268584, "frac_reward_zero_std": 1.0, "grad_norm": 4.8340357061989675e-21, "kl": 0.016876220703125, "learning_rate": 6.436816423585415e-06, "loss": 0.0007, "num_tokens": 1960831981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6547751130835539, "frac_reward_zero_std": 1.0, "grad_norm": 4.837459004965074e-21, "kl": 0.01654052734375, "learning_rate": 6.431250219599083e-06, "loss": 0.0007, "num_tokens": 1961399981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6549458052402493, "frac_reward_zero_std": 1.0, "grad_norm": 4.973426720475598e-21, "kl": 0.017120361328125, "learning_rate": 6.425685282390096e-06, "loss": 0.0007, "num_tokens": 1961968925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6551164973969447, "frac_reward_zero_std": 1.0, "grad_norm": 5.218880087982014e-21, "kl": 0.01702880859375, "learning_rate": 6.42012161393381e-06, "loss": 0.0007, "num_tokens": 1962533741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.65528718955364, "frac_reward_zero_std": 1.0, "grad_norm": 5.1427266522915944e-21, "kl": 0.017120361328125, "learning_rate": 6.414559216205125e-06, "loss": 0.0007, "num_tokens": 1963099949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6554578817103354, "frac_reward_zero_std": 1.0, "grad_norm": 5.67493128356835e-21, "kl": 0.016937255859375, "learning_rate": 6.408998091178492e-06, "loss": 0.0007, "num_tokens": 1963663421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6556285738670308, "frac_reward_zero_std": 1.0, "grad_norm": 4.93080034016423e-21, "kl": 0.017425537109375, "learning_rate": 6.403438240827906e-06, "loss": 0.0007, "num_tokens": 1964228445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6557992660237262, "frac_reward_zero_std": 1.0, "grad_norm": 5.273974904260778e-21, "kl": 0.01702880859375, "learning_rate": 6.397879667126918e-06, "loss": 0.0007, "num_tokens": 1964794141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6559699581804216, "frac_reward_zero_std": 1.0, "grad_norm": 5.730588839821104e-21, "kl": 0.016876220703125, "learning_rate": 6.392322372048616e-06, "loss": 0.0007, "num_tokens": 1965360541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.656140650337117, "frac_reward_zero_std": 1.0, "grad_norm": 4.856268441139583e-21, "kl": 0.01708984375, "learning_rate": 6.3867663575656505e-06, "loss": 0.0007, "num_tokens": 1965929117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6563113424938124, "frac_reward_zero_std": 1.0, "grad_norm": 5.343921193099658e-21, "kl": 0.01708984375, "learning_rate": 6.38121162565019e-06, "loss": 0.0007, "num_tokens": 1966503245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6564820346505078, "frac_reward_zero_std": 1.0, "grad_norm": 5.648863632418444e-21, "kl": 0.01690673828125, "learning_rate": 6.375658178273973e-06, "loss": 0.0007, "num_tokens": 1967075181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6566527268072032, "frac_reward_zero_std": 1.0, "grad_norm": 4.915296814077838e-21, "kl": 0.01715087890625, "learning_rate": 6.370106017408272e-06, "loss": 0.0007, "num_tokens": 1967637725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6568234189638986, "frac_reward_zero_std": 1.0, "grad_norm": 5.329307242023146e-21, "kl": 0.016937255859375, "learning_rate": 6.364555145023905e-06, "loss": 0.0007, "num_tokens": 1968200077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.656994111120594, "frac_reward_zero_std": 1.0, "grad_norm": 4.6123074662742464e-21, "kl": 0.016937255859375, "learning_rate": 6.359005563091229e-06, "loss": 0.0007, "num_tokens": 1968766269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6571648032772894, "frac_reward_zero_std": 1.0, "grad_norm": 5.398367015725319e-21, "kl": 0.01666259765625, "learning_rate": 6.3534572735801445e-06, "loss": 0.0007, "num_tokens": 1969329661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6573354954339848, "frac_reward_zero_std": 1.0, "grad_norm": 5.000746436164032e-21, "kl": 0.016754150390625, "learning_rate": 6.347910278460094e-06, "loss": 0.0007, "num_tokens": 1969895869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6575061875906802, "frac_reward_zero_std": 1.0, "grad_norm": 5.493313073787753e-21, "kl": 0.016937255859375, "learning_rate": 6.342364579700065e-06, "loss": 0.0007, "num_tokens": 1970459757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6576768797473757, "frac_reward_zero_std": 1.0, "grad_norm": 5.1076356639233134e-21, "kl": 0.017181396484375, "learning_rate": 6.336820179268577e-06, "loss": 0.0007, "num_tokens": 1971025485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6578475719040711, "frac_reward_zero_std": 1.0, "grad_norm": 4.575948472323353e-21, "kl": 0.017120361328125, "learning_rate": 6.331277079133688e-06, "loss": 0.0007, "num_tokens": 1971588269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6580182640607664, "frac_reward_zero_std": 1.0, "grad_norm": 5.35635782258019e-21, "kl": 0.017059326171875, "learning_rate": 6.325735281263006e-06, "loss": 0.0007, "num_tokens": 1972159197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6581889562174618, "frac_reward_zero_std": 1.0, "grad_norm": 4.866884505246415e-21, "kl": 0.016632080078125, "learning_rate": 6.320194787623667e-06, "loss": 0.0007, "num_tokens": 1972731341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6583596483741572, "frac_reward_zero_std": 1.0, "grad_norm": 5.5490219406017315e-21, "kl": 0.016937255859375, "learning_rate": 6.314655600182343e-06, "loss": 0.0007, "num_tokens": 1973296397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6585303405308526, "frac_reward_zero_std": 1.0, "grad_norm": 4.7092376901670466e-21, "kl": 0.017120361328125, "learning_rate": 6.309117720905247e-06, "loss": 0.0007, "num_tokens": 1973869037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.658701032687548, "frac_reward_zero_std": 1.0, "grad_norm": 4.914998665353275e-21, "kl": 0.017120361328125, "learning_rate": 6.303581151758127e-06, "loss": 0.0007, "num_tokens": 1974433437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6588717248442434, "frac_reward_zero_std": 1.0, "grad_norm": 5.36668738263129e-21, "kl": 0.0174560546875, "learning_rate": 6.298045894706268e-06, "loss": 0.0007, "num_tokens": 1975001917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6590424170009388, "frac_reward_zero_std": 1.0, "grad_norm": 4.198681911830852e-21, "kl": 0.01702880859375, "learning_rate": 6.292511951714482e-06, "loss": 0.0007, "num_tokens": 1975571805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6592131091576342, "frac_reward_zero_std": 1.0, "grad_norm": 4.975626033651454e-21, "kl": 0.017425537109375, "learning_rate": 6.286979324747118e-06, "loss": 0.0007, "num_tokens": 1976135613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6593838013143296, "frac_reward_zero_std": 1.0, "grad_norm": 4.201554398644284e-21, "kl": 0.016754150390625, "learning_rate": 6.281448015768063e-06, "loss": 0.0007, "num_tokens": 1976700029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.659554493471025, "frac_reward_zero_std": 1.0, "grad_norm": 4.676441071668044e-21, "kl": 0.016998291015625, "learning_rate": 6.275918026740732e-06, "loss": 0.0007, "num_tokens": 1977262957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6597251856277204, "frac_reward_zero_std": 1.0, "grad_norm": 4.886729827543831e-21, "kl": 0.0172119140625, "learning_rate": 6.2703893596280675e-06, "loss": 0.0007, "num_tokens": 1977832685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6598958777844158, "frac_reward_zero_std": 1.0, "grad_norm": 5.253137066135858e-21, "kl": 0.016754150390625, "learning_rate": 6.26486201639255e-06, "loss": 0.0007, "num_tokens": 1978392797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6600665699411112, "frac_reward_zero_std": 1.0, "grad_norm": 5.409867454137726e-21, "kl": 0.016815185546875, "learning_rate": 6.259335998996185e-06, "loss": 0.0007, "num_tokens": 1978959053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6602372620978066, "frac_reward_zero_std": 1.0, "grad_norm": 4.471330032932326e-21, "kl": 0.01702880859375, "learning_rate": 6.253811309400515e-06, "loss": 0.0007, "num_tokens": 1979522109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.660407954254502, "frac_reward_zero_std": 1.0, "grad_norm": 5.551116417666848e-21, "kl": 0.017181396484375, "learning_rate": 6.248287949566594e-06, "loss": 0.0007, "num_tokens": 1980088957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6605786464111975, "frac_reward_zero_std": 1.0, "grad_norm": 5.217079519720737e-21, "kl": 0.0169677734375, "learning_rate": 6.242765921455025e-06, "loss": 0.0007, "num_tokens": 1980652957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6607493385678928, "frac_reward_zero_std": 1.0, "grad_norm": 5.057572078588498e-21, "kl": 0.01702880859375, "learning_rate": 6.2372452270259235e-06, "loss": 0.0007, "num_tokens": 1981218589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6609200307245882, "frac_reward_zero_std": 1.0, "grad_norm": 5.178522546186768e-21, "kl": 0.01739501953125, "learning_rate": 6.231725868238945e-06, "loss": 0.0007, "num_tokens": 1981784109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6610907228812836, "frac_reward_zero_std": 1.0, "grad_norm": 4.765656751644398e-21, "kl": 0.016845703125, "learning_rate": 6.2262078470532496e-06, "loss": 0.0007, "num_tokens": 1982347709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.661261415037979, "frac_reward_zero_std": 1.0, "grad_norm": 4.529978669009047e-21, "kl": 0.01702880859375, "learning_rate": 6.220691165427544e-06, "loss": 0.0007, "num_tokens": 1982914573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6614321071946744, "frac_reward_zero_std": 1.0, "grad_norm": 5.0497925154786675e-21, "kl": 0.01708984375, "learning_rate": 6.215175825320048e-06, "loss": 0.0007, "num_tokens": 1983486525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6616027993513698, "frac_reward_zero_std": 1.0, "grad_norm": 4.8820453328608115e-21, "kl": 0.016754150390625, "learning_rate": 6.209661828688513e-06, "loss": 0.0007, "num_tokens": 1984052413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6617734915080652, "frac_reward_zero_std": 1.0, "grad_norm": 4.596727940476058e-21, "kl": 0.017059326171875, "learning_rate": 6.2041491774902055e-06, "loss": 0.0007, "num_tokens": 1984616445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6619441836647606, "frac_reward_zero_std": 1.0, "grad_norm": 4.830369349725672e-21, "kl": 0.016845703125, "learning_rate": 6.1986378736819165e-06, "loss": 0.0007, "num_tokens": 1985180621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.662114875821456, "frac_reward_zero_std": 1.0, "grad_norm": 5.3895254905576006e-21, "kl": 0.01690673828125, "learning_rate": 6.1931279192199586e-06, "loss": 0.0007, "num_tokens": 1985741645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6622855679781514, "frac_reward_zero_std": 1.0, "grad_norm": 4.893723218811588e-21, "kl": 0.01690673828125, "learning_rate": 6.187619316060174e-06, "loss": 0.0007, "num_tokens": 1986304605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6624562601348468, "frac_reward_zero_std": 1.0, "grad_norm": 4.662272330753169e-21, "kl": 0.016448974609375, "learning_rate": 6.182112066157911e-06, "loss": 0.0007, "num_tokens": 1986868477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6626269522915422, "frac_reward_zero_std": 1.0, "grad_norm": 5.076458656117566e-21, "kl": 0.017120361328125, "learning_rate": 6.176606171468044e-06, "loss": 0.0007, "num_tokens": 1987432205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6627976444482376, "frac_reward_zero_std": 1.0, "grad_norm": 4.4291592677332594e-21, "kl": 0.016693115234375, "learning_rate": 6.1711016339449715e-06, "loss": 0.0007, "num_tokens": 1987994221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.662968336604933, "frac_reward_zero_std": 1.0, "grad_norm": 4.524056409788106e-21, "kl": 0.0166015625, "learning_rate": 6.165598455542607e-06, "loss": 0.0007, "num_tokens": 1988555453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6631390287616284, "frac_reward_zero_std": 1.0, "grad_norm": 4.685231842725481e-21, "kl": 0.01715087890625, "learning_rate": 6.1600966382143734e-06, "loss": 0.0007, "num_tokens": 1989124877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6633097209183239, "frac_reward_zero_std": 1.0, "grad_norm": 4.308143328988012e-21, "kl": 0.016845703125, "learning_rate": 6.15459618391322e-06, "loss": 0.0007, "num_tokens": 1989693533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6634804130750191, "frac_reward_zero_std": 1.0, "grad_norm": 4.77291468508773e-21, "kl": 0.0169677734375, "learning_rate": 6.149097094591611e-06, "loss": 0.0007, "num_tokens": 1990254941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6636511052317146, "frac_reward_zero_std": 1.0, "grad_norm": 4.214128219245344e-21, "kl": 0.01666259765625, "learning_rate": 6.143599372201525e-06, "loss": 0.0007, "num_tokens": 1990818173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.66382179738841, "frac_reward_zero_std": 1.0, "grad_norm": 4.393990242155261e-21, "kl": 0.016693115234375, "learning_rate": 6.138103018694455e-06, "loss": 0.0007, "num_tokens": 1991381373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6639924895451054, "frac_reward_zero_std": 1.0, "grad_norm": 4.737735072469092e-21, "kl": 0.01715087890625, "learning_rate": 6.1326080360214014e-06, "loss": 0.0007, "num_tokens": 1991948157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6641631817018008, "frac_reward_zero_std": 1.0, "grad_norm": 5.242999583695632e-21, "kl": 0.01727294921875, "learning_rate": 6.127114426132894e-06, "loss": 0.0007, "num_tokens": 1992516941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6643338738584962, "frac_reward_zero_std": 1.0, "grad_norm": 4.76338182877097e-21, "kl": 0.01678466796875, "learning_rate": 6.1216221909789645e-06, "loss": 0.0007, "num_tokens": 1993083517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6645045660151916, "frac_reward_zero_std": 1.0, "grad_norm": 4.597177100872727e-21, "kl": 0.01666259765625, "learning_rate": 6.1161313325091535e-06, "loss": 0.0007, "num_tokens": 1993645581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.664675258171887, "frac_reward_zero_std": 1.0, "grad_norm": 4.8334680386186624e-21, "kl": 0.01678466796875, "learning_rate": 6.110641852672516e-06, "loss": 0.0007, "num_tokens": 1994222877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6648459503285824, "frac_reward_zero_std": 1.0, "grad_norm": 5.275745086635145e-21, "kl": 0.017242431640625, "learning_rate": 6.105153753417628e-06, "loss": 0.0007, "num_tokens": 1994786877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6650166424852778, "frac_reward_zero_std": 1.0, "grad_norm": 4.874378816269634e-21, "kl": 0.016937255859375, "learning_rate": 6.099667036692563e-06, "loss": 0.0007, "num_tokens": 1995354589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6651873346419732, "frac_reward_zero_std": 1.0, "grad_norm": 4.744945074556665e-21, "kl": 0.017242431640625, "learning_rate": 6.0941817044448995e-06, "loss": 0.0007, "num_tokens": 1995921309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6653580267986686, "frac_reward_zero_std": 1.0, "grad_norm": 5.308516112937788e-21, "kl": 0.017059326171875, "learning_rate": 6.088697758621743e-06, "loss": 0.0007, "num_tokens": 1996486957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.665528718955364, "frac_reward_zero_std": 1.0, "grad_norm": 5.3241233668127544e-21, "kl": 0.0167236328125, "learning_rate": 6.083215201169692e-06, "loss": 0.0007, "num_tokens": 1997053901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6656994111120594, "frac_reward_zero_std": 1.0, "grad_norm": 4.893400992047244e-21, "kl": 0.01727294921875, "learning_rate": 6.077734034034859e-06, "loss": 0.0007, "num_tokens": 1997623581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6658701032687548, "frac_reward_zero_std": 1.0, "grad_norm": 5.081277569033108e-21, "kl": 0.016876220703125, "learning_rate": 6.0722542591628544e-06, "loss": 0.0007, "num_tokens": 1998200445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6660407954254502, "frac_reward_zero_std": 1.0, "grad_norm": 4.6429735803653115e-21, "kl": 0.0169677734375, "learning_rate": 6.066775878498805e-06, "loss": 0.0007, "num_tokens": 1998761917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6662114875821455, "frac_reward_zero_std": 1.0, "grad_norm": 5.2689989578888015e-21, "kl": 0.0167236328125, "learning_rate": 6.061298893987337e-06, "loss": 0.0007, "num_tokens": 1999339597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.666382179738841, "frac_reward_zero_std": 1.0, "grad_norm": 4.448542612307494e-21, "kl": 0.01678466796875, "learning_rate": 6.055823307572588e-06, "loss": 0.0007, "num_tokens": 1999903373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6665528718955364, "frac_reward_zero_std": 1.0, "grad_norm": 5.205374579483255e-21, "kl": 0.017333984375, "learning_rate": 6.050349121198182e-06, "loss": 0.0007, "num_tokens": 2000469901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6667235640522318, "frac_reward_zero_std": 1.0, "grad_norm": 4.712029102417252e-21, "kl": 0.017181396484375, "learning_rate": 6.044876336807267e-06, "loss": 0.0007, "num_tokens": 2001035677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6668942562089272, "frac_reward_zero_std": 1.0, "grad_norm": 5.047262848433478e-21, "kl": 0.016754150390625, "learning_rate": 6.039404956342478e-06, "loss": 0.0007, "num_tokens": 2001601805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6670649483656226, "frac_reward_zero_std": 1.0, "grad_norm": 4.9140721025801295e-21, "kl": 0.017578125, "learning_rate": 6.033934981745964e-06, "loss": 0.0007, "num_tokens": 2002176173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.667235640522318, "frac_reward_zero_std": 1.0, "grad_norm": 5.3111194449247535e-21, "kl": 0.017181396484375, "learning_rate": 6.028466414959362e-06, "loss": 0.0007, "num_tokens": 2002741789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6674063326790134, "frac_reward_zero_std": 1.0, "grad_norm": 5.2479007232827664e-21, "kl": 0.017181396484375, "learning_rate": 6.022999257923821e-06, "loss": 0.0007, "num_tokens": 2003308109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6675770248357088, "frac_reward_zero_std": 1.0, "grad_norm": 4.8254314814856154e-21, "kl": 0.01678466796875, "learning_rate": 6.017533512579977e-06, "loss": 0.0007, "num_tokens": 2003873245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6677477169924042, "frac_reward_zero_std": 1.0, "grad_norm": 5.072145192138248e-21, "kl": 0.016693115234375, "learning_rate": 6.0120691808679835e-06, "loss": 0.0007, "num_tokens": 2004439645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6679184091490996, "frac_reward_zero_std": 1.0, "grad_norm": 4.469297749247406e-21, "kl": 0.0167236328125, "learning_rate": 6.006606264727472e-06, "loss": 0.0007, "num_tokens": 2005006765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.668089101305795, "frac_reward_zero_std": 1.0, "grad_norm": 5.170482370354679e-21, "kl": 0.016998291015625, "learning_rate": 6.00114476609758e-06, "loss": 0.0007, "num_tokens": 2005571229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6682597934624904, "frac_reward_zero_std": 1.0, "grad_norm": 5.309070835524667e-21, "kl": 0.016845703125, "learning_rate": 5.995684686916949e-06, "loss": 0.0007, "num_tokens": 2006134013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6684304856191858, "frac_reward_zero_std": 1.0, "grad_norm": 4.364319737649781e-21, "kl": 0.01727294921875, "learning_rate": 5.990226029123709e-06, "loss": 0.0007, "num_tokens": 2006698221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6686011777758812, "frac_reward_zero_std": 1.0, "grad_norm": 5.140016041145289e-21, "kl": 0.017059326171875, "learning_rate": 5.9847687946554815e-06, "loss": 0.0007, "num_tokens": 2007259517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6687718699325766, "frac_reward_zero_std": 1.0, "grad_norm": 4.881588037616097e-21, "kl": 0.017364501953125, "learning_rate": 5.979312985449388e-06, "loss": 0.0007, "num_tokens": 2007825229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.668942562089272, "frac_reward_zero_std": 1.0, "grad_norm": 4.338759891571872e-21, "kl": 0.01666259765625, "learning_rate": 5.973858603442049e-06, "loss": 0.0007, "num_tokens": 2008386413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6691132542459673, "frac_reward_zero_std": 1.0, "grad_norm": 5.146451320419456e-21, "kl": 0.01678466796875, "learning_rate": 5.968405650569572e-06, "loss": 0.0007, "num_tokens": 2008967517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6692839464026628, "frac_reward_zero_std": 1.0, "grad_norm": 5.2647068274002204e-21, "kl": 0.017120361328125, "learning_rate": 5.962954128767555e-06, "loss": 0.0007, "num_tokens": 2009536781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6694546385593582, "frac_reward_zero_std": 1.0, "grad_norm": 5.249535436608738e-21, "kl": 0.0169677734375, "learning_rate": 5.957504039971091e-06, "loss": 0.0007, "num_tokens": 2010100925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6696253307160536, "frac_reward_zero_std": 1.0, "grad_norm": 4.606685068322693e-21, "kl": 0.01654052734375, "learning_rate": 5.952055386114769e-06, "loss": 0.0007, "num_tokens": 2010663933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.669796022872749, "frac_reward_zero_std": 1.0, "grad_norm": 5.218379511144397e-21, "kl": 0.016937255859375, "learning_rate": 5.946608169132665e-06, "loss": 0.0007, "num_tokens": 2011229037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6699667150294444, "frac_reward_zero_std": 1.0, "grad_norm": 4.969842289769326e-21, "kl": 0.017181396484375, "learning_rate": 5.9411623909583374e-06, "loss": 0.0007, "num_tokens": 2011794157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6701374071861398, "frac_reward_zero_std": 1.0, "grad_norm": 4.454094885471139e-21, "kl": 0.016632080078125, "learning_rate": 5.935718053524848e-06, "loss": 0.0007, "num_tokens": 2012357981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6703080993428352, "frac_reward_zero_std": 1.0, "grad_norm": 4.859795388612645e-21, "kl": 0.016876220703125, "learning_rate": 5.9302751587647374e-06, "loss": 0.0007, "num_tokens": 2012922429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6704787914995306, "frac_reward_zero_std": 1.0, "grad_norm": 4.776986118559409e-21, "kl": 0.0169677734375, "learning_rate": 5.9248337086100385e-06, "loss": 0.0007, "num_tokens": 2013490285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.670649483656226, "frac_reward_zero_std": 1.0, "grad_norm": 4.8051415332550306e-21, "kl": 0.016815185546875, "learning_rate": 5.919393704992265e-06, "loss": 0.0007, "num_tokens": 2014057053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6708201758129214, "frac_reward_zero_std": 1.0, "grad_norm": 4.458268267804978e-21, "kl": 0.0162353515625, "learning_rate": 5.913955149842425e-06, "loss": 0.0006, "num_tokens": 2014629485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6709908679696168, "frac_reward_zero_std": 1.0, "grad_norm": 5.058670397922352e-21, "kl": 0.017303466796875, "learning_rate": 5.908518045091007e-06, "loss": 0.0007, "num_tokens": 2015190189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6711615601263122, "frac_reward_zero_std": 1.0, "grad_norm": 4.713999705637622e-21, "kl": 0.01727294921875, "learning_rate": 5.9030823926679955e-06, "loss": 0.0007, "num_tokens": 2015754717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6713322522830076, "frac_reward_zero_std": 1.0, "grad_norm": 4.887886440958989e-21, "kl": 0.017120361328125, "learning_rate": 5.89764819450284e-06, "loss": 0.0007, "num_tokens": 2016321421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.671502944439703, "frac_reward_zero_std": 1.0, "grad_norm": 4.71585490506377e-21, "kl": 0.016937255859375, "learning_rate": 5.89221545252449e-06, "loss": 0.0007, "num_tokens": 2016888141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6716736365963984, "frac_reward_zero_std": 1.0, "grad_norm": 4.167893049581358e-21, "kl": 0.0166015625, "learning_rate": 5.8867841686613695e-06, "loss": 0.0007, "num_tokens": 2017452205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6718443287530937, "frac_reward_zero_std": 1.0, "grad_norm": 5.475508470429995e-21, "kl": 0.0174560546875, "learning_rate": 5.8813543448413965e-06, "loss": 0.0007, "num_tokens": 2018016957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6720150209097892, "frac_reward_zero_std": 1.0, "grad_norm": 5.32211070730778e-21, "kl": 0.01727294921875, "learning_rate": 5.8759259829919505e-06, "loss": 0.0007, "num_tokens": 2018585885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6721857130664846, "frac_reward_zero_std": 1.0, "grad_norm": 4.448513864637837e-21, "kl": 0.01666259765625, "learning_rate": 5.870499085039912e-06, "loss": 0.0007, "num_tokens": 2019149693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.67235640522318, "frac_reward_zero_std": 1.0, "grad_norm": 4.597802342763782e-21, "kl": 0.016876220703125, "learning_rate": 5.865073652911631e-06, "loss": 0.0007, "num_tokens": 2019720717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6725270973798754, "frac_reward_zero_std": 1.0, "grad_norm": 4.153122451471255e-21, "kl": 0.016998291015625, "learning_rate": 5.859649688532943e-06, "loss": 0.0007, "num_tokens": 2020289133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6726977895365708, "frac_reward_zero_std": 1.0, "grad_norm": 4.16765910885298e-21, "kl": 0.01654052734375, "learning_rate": 5.854227193829158e-06, "loss": 0.0007, "num_tokens": 2020849293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6728684816932662, "frac_reward_zero_std": 1.0, "grad_norm": 4.440299782395092e-21, "kl": 0.0177001953125, "learning_rate": 5.8488061707250635e-06, "loss": 0.0007, "num_tokens": 2021413133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6730391738499616, "frac_reward_zero_std": 1.0, "grad_norm": 5.027741537927262e-21, "kl": 0.01727294921875, "learning_rate": 5.843386621144927e-06, "loss": 0.0007, "num_tokens": 2021982733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.673209866006657, "frac_reward_zero_std": 1.0, "grad_norm": 4.476220364811676e-21, "kl": 0.017608642578125, "learning_rate": 5.8379685470125016e-06, "loss": 0.0007, "num_tokens": 2022549549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6733805581633524, "frac_reward_zero_std": 1.0, "grad_norm": 5.5065277695415134e-21, "kl": 0.01702880859375, "learning_rate": 5.832551950250996e-06, "loss": 0.0007, "num_tokens": 2023116493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6735512503200478, "frac_reward_zero_std": 1.0, "grad_norm": 4.4742667659683424e-21, "kl": 0.016845703125, "learning_rate": 5.827136832783109e-06, "loss": 0.0007, "num_tokens": 2023683037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6737219424767432, "frac_reward_zero_std": 1.0, "grad_norm": 4.795627743420086e-21, "kl": 0.016998291015625, "learning_rate": 5.821723196531017e-06, "loss": 0.0007, "num_tokens": 2024252365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6738926346334386, "frac_reward_zero_std": 1.0, "grad_norm": 5.208597543250582e-21, "kl": 0.017181396484375, "learning_rate": 5.816311043416371e-06, "loss": 0.0007, "num_tokens": 2024823565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.674063326790134, "frac_reward_zero_std": 1.0, "grad_norm": 4.535405207187083e-21, "kl": 0.01690673828125, "learning_rate": 5.8109003753602735e-06, "loss": 0.0007, "num_tokens": 2025391213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6742340189468294, "frac_reward_zero_std": 1.0, "grad_norm": 5.0355549062594615e-21, "kl": 0.017181396484375, "learning_rate": 5.805491194283325e-06, "loss": 0.0007, "num_tokens": 2025960637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6744047111035248, "frac_reward_zero_std": 1.0, "grad_norm": 4.382926015588368e-21, "kl": 0.017242431640625, "learning_rate": 5.800083502105589e-06, "loss": 0.0007, "num_tokens": 2026528269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6745754032602201, "frac_reward_zero_std": 1.0, "grad_norm": 5.346854111333775e-21, "kl": 0.016815185546875, "learning_rate": 5.794677300746606e-06, "loss": 0.0007, "num_tokens": 2027103037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6747460954169155, "frac_reward_zero_std": 1.0, "grad_norm": 4.9320634139517075e-21, "kl": 0.01727294921875, "learning_rate": 5.789272592125378e-06, "loss": 0.0007, "num_tokens": 2027668797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.674916787573611, "frac_reward_zero_std": 1.0, "grad_norm": 4.9225276602223766e-21, "kl": 0.01666259765625, "learning_rate": 5.7838693781603775e-06, "loss": 0.0007, "num_tokens": 2028233709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6750874797303064, "frac_reward_zero_std": 1.0, "grad_norm": 4.7776995777231104e-21, "kl": 0.01690673828125, "learning_rate": 5.778467660769553e-06, "loss": 0.0007, "num_tokens": 2028799565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6752581718870018, "frac_reward_zero_std": 1.0, "grad_norm": 4.332444902672122e-21, "kl": 0.016876220703125, "learning_rate": 5.773067441870326e-06, "loss": 0.0007, "num_tokens": 2029362429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6754288640436972, "frac_reward_zero_std": 1.0, "grad_norm": 4.682872484905323e-21, "kl": 0.01666259765625, "learning_rate": 5.7676687233795716e-06, "loss": 0.0007, "num_tokens": 2029925741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6755995562003926, "frac_reward_zero_std": 1.0, "grad_norm": 4.699611900542588e-21, "kl": 0.01727294921875, "learning_rate": 5.7622715072136395e-06, "loss": 0.0007, "num_tokens": 2030486093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.675770248357088, "frac_reward_zero_std": 1.0, "grad_norm": 4.581371415411413e-21, "kl": 0.016754150390625, "learning_rate": 5.756875795288348e-06, "loss": 0.0007, "num_tokens": 2031048365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6759409405137834, "frac_reward_zero_std": 1.0, "grad_norm": 4.498200628585333e-21, "kl": 0.017242431640625, "learning_rate": 5.751481589518987e-06, "loss": 0.0007, "num_tokens": 2031614989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6761116326704788, "frac_reward_zero_std": 1.0, "grad_norm": 4.701803954746412e-21, "kl": 0.01702880859375, "learning_rate": 5.7460888918202984e-06, "loss": 0.0007, "num_tokens": 2032185917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6762823248271742, "frac_reward_zero_std": 1.0, "grad_norm": 5.3726595739129495e-21, "kl": 0.016632080078125, "learning_rate": 5.740697704106491e-06, "loss": 0.0007, "num_tokens": 2032762925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6764530169838696, "frac_reward_zero_std": 1.0, "grad_norm": 4.465289461749414e-21, "kl": 0.01678466796875, "learning_rate": 5.735308028291246e-06, "loss": 0.0007, "num_tokens": 2033325741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.676623709140565, "frac_reward_zero_std": 1.0, "grad_norm": 4.656251821789086e-21, "kl": 0.017425537109375, "learning_rate": 5.72991986628771e-06, "loss": 0.0007, "num_tokens": 2033889757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6767944012972604, "frac_reward_zero_std": 1.0, "grad_norm": 4.7098622880535046e-21, "kl": 0.016387939453125, "learning_rate": 5.72453322000848e-06, "loss": 0.0007, "num_tokens": 2034459101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6769650934539558, "frac_reward_zero_std": 1.0, "grad_norm": 5.255923927919934e-21, "kl": 0.017303466796875, "learning_rate": 5.719148091365618e-06, "loss": 0.0007, "num_tokens": 2035028653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6771357856106512, "frac_reward_zero_std": 1.0, "grad_norm": 4.9071987709356135e-21, "kl": 0.016937255859375, "learning_rate": 5.7137644822706515e-06, "loss": 0.0007, "num_tokens": 2035591405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6773064777673465, "frac_reward_zero_std": 1.0, "grad_norm": 4.844509234054827e-21, "kl": 0.0172119140625, "learning_rate": 5.7083823946345765e-06, "loss": 0.0007, "num_tokens": 2036157581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6774771699240419, "frac_reward_zero_std": 1.0, "grad_norm": 4.36087008464111e-21, "kl": 0.017547607421875, "learning_rate": 5.703001830367831e-06, "loss": 0.0007, "num_tokens": 2036724477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6776478620807374, "frac_reward_zero_std": 1.0, "grad_norm": 4.479965344785203e-21, "kl": 0.01739501953125, "learning_rate": 5.697622791380319e-06, "loss": 0.0007, "num_tokens": 2037287645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6778185542374328, "frac_reward_zero_std": 1.0, "grad_norm": 5.319124131107778e-21, "kl": 0.016998291015625, "learning_rate": 5.69224527958141e-06, "loss": 0.0007, "num_tokens": 2037855645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6779892463941282, "frac_reward_zero_std": 1.0, "grad_norm": 5.070340075451124e-21, "kl": 0.017425537109375, "learning_rate": 5.6868692968799285e-06, "loss": 0.0007, "num_tokens": 2038425965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6781599385508236, "frac_reward_zero_std": 1.0, "grad_norm": 4.379736134258267e-21, "kl": 0.01708984375, "learning_rate": 5.6814948451841504e-06, "loss": 0.0007, "num_tokens": 2038993501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.678330630707519, "frac_reward_zero_std": 1.0, "grad_norm": 5.025024788009156e-21, "kl": 0.016876220703125, "learning_rate": 5.6761219264018145e-06, "loss": 0.0007, "num_tokens": 2039561437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6785013228642144, "frac_reward_zero_std": 1.0, "grad_norm": 4.880964852691061e-21, "kl": 0.0167236328125, "learning_rate": 5.670750542440108e-06, "loss": 0.0007, "num_tokens": 2040128701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6786720150209098, "frac_reward_zero_std": 1.0, "grad_norm": 4.986160684779683e-21, "kl": 0.01702880859375, "learning_rate": 5.665380695205687e-06, "loss": 0.0007, "num_tokens": 2040692381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6788427071776052, "frac_reward_zero_std": 1.0, "grad_norm": 5.309501633800157e-21, "kl": 0.017181396484375, "learning_rate": 5.660012386604643e-06, "loss": 0.0007, "num_tokens": 2041257181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6790133993343006, "frac_reward_zero_std": 1.0, "grad_norm": 5.15225913720219e-21, "kl": 0.017120361328125, "learning_rate": 5.6546456185425425e-06, "loss": 0.0007, "num_tokens": 2041824493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.679184091490996, "frac_reward_zero_std": 1.0, "grad_norm": 4.551395197126564e-21, "kl": 0.016632080078125, "learning_rate": 5.649280392924384e-06, "loss": 0.0007, "num_tokens": 2042393533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6793547836476914, "frac_reward_zero_std": 1.0, "grad_norm": 5.081360217141557e-21, "kl": 0.016998291015625, "learning_rate": 5.643916711654639e-06, "loss": 0.0007, "num_tokens": 2042962029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6795254758043868, "frac_reward_zero_std": 1.0, "grad_norm": 4.6874468578851776e-21, "kl": 0.016937255859375, "learning_rate": 5.638554576637212e-06, "loss": 0.0007, "num_tokens": 2043527437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6796961679610822, "frac_reward_zero_std": 1.0, "grad_norm": 5.48509810063508e-21, "kl": 0.016998291015625, "learning_rate": 5.633193989775474e-06, "loss": 0.0007, "num_tokens": 2044096989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6798668601177776, "frac_reward_zero_std": 1.0, "grad_norm": 4.6838822494759294e-21, "kl": 0.01727294921875, "learning_rate": 5.627834952972233e-06, "loss": 0.0007, "num_tokens": 2044660653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6800375522744729, "frac_reward_zero_std": 1.0, "grad_norm": 5.172617082535318e-21, "kl": 0.0167236328125, "learning_rate": 5.622477468129764e-06, "loss": 0.0007, "num_tokens": 2045221821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6802082444311683, "frac_reward_zero_std": 1.0, "grad_norm": 4.835361971441402e-21, "kl": 0.016845703125, "learning_rate": 5.617121537149768e-06, "loss": 0.0007, "num_tokens": 2045793821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6803789365878637, "frac_reward_zero_std": 1.0, "grad_norm": 5.306701877824068e-21, "kl": 0.016693115234375, "learning_rate": 5.611767161933417e-06, "loss": 0.0007, "num_tokens": 2046355853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6805496287445592, "frac_reward_zero_std": 1.0, "grad_norm": 4.847558817395178e-21, "kl": 0.017730712890625, "learning_rate": 5.6064143443813145e-06, "loss": 0.0007, "num_tokens": 2046924365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6807203209012546, "frac_reward_zero_std": 1.0, "grad_norm": 4.539470014253424e-21, "kl": 0.017242431640625, "learning_rate": 5.6010630863935245e-06, "loss": 0.0007, "num_tokens": 2047492381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.68089101305795, "frac_reward_zero_std": 1.0, "grad_norm": 5.230287249608044e-21, "kl": 0.01739501953125, "learning_rate": 5.595713389869542e-06, "loss": 0.0007, "num_tokens": 2048057469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6810617052146454, "frac_reward_zero_std": 1.0, "grad_norm": 4.744028340995569e-21, "kl": 0.017242431640625, "learning_rate": 5.590365256708321e-06, "loss": 0.0007, "num_tokens": 2048620749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6812323973713408, "frac_reward_zero_std": 1.0, "grad_norm": 4.801649452214612e-21, "kl": 0.01666259765625, "learning_rate": 5.585018688808259e-06, "loss": 0.0007, "num_tokens": 2049189549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6814030895280362, "frac_reward_zero_std": 1.0, "grad_norm": 5.3449765625520504e-21, "kl": 0.016693115234375, "learning_rate": 5.579673688067191e-06, "loss": 0.0007, "num_tokens": 2049755901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6815737816847316, "frac_reward_zero_std": 1.0, "grad_norm": 4.268054083362896e-21, "kl": 0.016815185546875, "learning_rate": 5.5743302563823945e-06, "loss": 0.0007, "num_tokens": 2050320365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.681744473841427, "frac_reward_zero_std": 1.0, "grad_norm": 4.736466279909717e-21, "kl": 0.01715087890625, "learning_rate": 5.5689883956506e-06, "loss": 0.0007, "num_tokens": 2050885773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6819151659981224, "frac_reward_zero_std": 1.0, "grad_norm": 4.885369083729316e-21, "kl": 0.01751708984375, "learning_rate": 5.563648107767978e-06, "loss": 0.0007, "num_tokens": 2051449069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6820858581548178, "frac_reward_zero_std": 1.0, "grad_norm": 4.81321399789555e-21, "kl": 0.017486572265625, "learning_rate": 5.558309394630135e-06, "loss": 0.0007, "num_tokens": 2052008781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6822565503115132, "frac_reward_zero_std": 1.0, "grad_norm": 4.279714546238199e-21, "kl": 0.017059326171875, "learning_rate": 5.552972258132117e-06, "loss": 0.0007, "num_tokens": 2052569309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6824272424682086, "frac_reward_zero_std": 1.0, "grad_norm": 4.710896136814313e-21, "kl": 0.016876220703125, "learning_rate": 5.5476367001684196e-06, "loss": 0.0007, "num_tokens": 2053143229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.682597934624904, "frac_reward_zero_std": 1.0, "grad_norm": 4.921545259316891e-21, "kl": 0.017425537109375, "learning_rate": 5.542302722632975e-06, "loss": 0.0007, "num_tokens": 2053723869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6827686267815993, "frac_reward_zero_std": 1.0, "grad_norm": 4.8856751937059774e-21, "kl": 0.016845703125, "learning_rate": 5.536970327419151e-06, "loss": 0.0007, "num_tokens": 2054285581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6829393189382947, "frac_reward_zero_std": 1.0, "grad_norm": 5.066806402222493e-21, "kl": 0.017608642578125, "learning_rate": 5.531639516419751e-06, "loss": 0.0007, "num_tokens": 2054855885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6831100110949901, "frac_reward_zero_std": 1.0, "grad_norm": 4.7051639923443954e-21, "kl": 0.017181396484375, "learning_rate": 5.526310291527024e-06, "loss": 0.0007, "num_tokens": 2055423965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6832807032516856, "frac_reward_zero_std": 1.0, "grad_norm": 5.380770275982974e-21, "kl": 0.0177001953125, "learning_rate": 5.5209826546326575e-06, "loss": 0.0007, "num_tokens": 2055987805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.683451395408381, "frac_reward_zero_std": 1.0, "grad_norm": 5.2262921556821494e-21, "kl": 0.017364501953125, "learning_rate": 5.515656607627764e-06, "loss": 0.0007, "num_tokens": 2056558493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6836220875650764, "frac_reward_zero_std": 1.0, "grad_norm": 4.924852616647066e-21, "kl": 0.016754150390625, "learning_rate": 5.510332152402896e-06, "loss": 0.0007, "num_tokens": 2057126157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6837927797217718, "frac_reward_zero_std": 1.0, "grad_norm": 4.845076931347991e-21, "kl": 0.01702880859375, "learning_rate": 5.505009290848047e-06, "loss": 0.0007, "num_tokens": 2057692221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6839634718784672, "frac_reward_zero_std": 1.0, "grad_norm": 4.499355949517389e-21, "kl": 0.017181396484375, "learning_rate": 5.499688024852642e-06, "loss": 0.0007, "num_tokens": 2058259213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6841341640351626, "frac_reward_zero_std": 1.0, "grad_norm": 4.421454089830474e-21, "kl": 0.016815185546875, "learning_rate": 5.494368356305537e-06, "loss": 0.0007, "num_tokens": 2058826397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.684304856191858, "frac_reward_zero_std": 1.0, "grad_norm": 4.41256839700683e-21, "kl": 0.016754150390625, "learning_rate": 5.489050287095017e-06, "loss": 0.0007, "num_tokens": 2059395661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6844755483485534, "frac_reward_zero_std": 1.0, "grad_norm": 5.00503685451621e-21, "kl": 0.01678466796875, "learning_rate": 5.4837338191088095e-06, "loss": 0.0007, "num_tokens": 2059975645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6846462405052488, "frac_reward_zero_std": 1.0, "grad_norm": 4.647760944088728e-21, "kl": 0.01708984375, "learning_rate": 5.478418954234069e-06, "loss": 0.0007, "num_tokens": 2060541981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6848169326619442, "frac_reward_zero_std": 1.0, "grad_norm": 4.4556925328013914e-21, "kl": 0.016845703125, "learning_rate": 5.473105694357388e-06, "loss": 0.0007, "num_tokens": 2061106925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6849876248186396, "frac_reward_zero_std": 1.0, "grad_norm": 4.69727219983666e-21, "kl": 0.016937255859375, "learning_rate": 5.467794041364766e-06, "loss": 0.0007, "num_tokens": 2061680317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.685158316975335, "frac_reward_zero_std": 1.0, "grad_norm": 4.739535626426508e-21, "kl": 0.017425537109375, "learning_rate": 5.462483997141655e-06, "loss": 0.0007, "num_tokens": 2062244525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6853290091320304, "frac_reward_zero_std": 1.0, "grad_norm": 5.913031493415054e-21, "kl": 0.017547607421875, "learning_rate": 5.457175563572932e-06, "loss": 0.0007, "num_tokens": 2062811565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6854997012887258, "frac_reward_zero_std": 1.0, "grad_norm": 4.49941285235002e-21, "kl": 0.017303466796875, "learning_rate": 5.4518687425429e-06, "loss": 0.0007, "num_tokens": 2063374749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6856703934454211, "frac_reward_zero_std": 1.0, "grad_norm": 4.378734649809857e-21, "kl": 0.0174560546875, "learning_rate": 5.446563535935286e-06, "loss": 0.0007, "num_tokens": 2063946973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6858410856021165, "frac_reward_zero_std": 1.0, "grad_norm": 4.496651484470724e-21, "kl": 0.01702880859375, "learning_rate": 5.441259945633244e-06, "loss": 0.0007, "num_tokens": 2064524989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.686011777758812, "frac_reward_zero_std": 1.0, "grad_norm": 4.77149800817848e-21, "kl": 0.01678466796875, "learning_rate": 5.43595797351936e-06, "loss": 0.0007, "num_tokens": 2065093309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6861824699155074, "frac_reward_zero_std": 1.0, "grad_norm": 4.477106639488346e-21, "kl": 0.017181396484375, "learning_rate": 5.430657621475647e-06, "loss": 0.0007, "num_tokens": 2065659949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6863531620722028, "frac_reward_zero_std": 1.0, "grad_norm": 5.112815488840418e-21, "kl": 0.01702880859375, "learning_rate": 5.425358891383535e-06, "loss": 0.0007, "num_tokens": 2066225197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6865238542288982, "frac_reward_zero_std": 1.0, "grad_norm": 4.824148795286695e-21, "kl": 0.01727294921875, "learning_rate": 5.420061785123876e-06, "loss": 0.0007, "num_tokens": 2066794429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6866945463855936, "frac_reward_zero_std": 1.0, "grad_norm": 5.591984230734139e-21, "kl": 0.017242431640625, "learning_rate": 5.414766304576958e-06, "loss": 0.0007, "num_tokens": 2067357261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.686865238542289, "frac_reward_zero_std": 1.0, "grad_norm": 5.3056707448661445e-21, "kl": 0.016937255859375, "learning_rate": 5.4094724516224885e-06, "loss": 0.0007, "num_tokens": 2067921917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6870359306989844, "frac_reward_zero_std": 1.0, "grad_norm": 4.912992431411818e-21, "kl": 0.016937255859375, "learning_rate": 5.404180228139591e-06, "loss": 0.0007, "num_tokens": 2068493629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6872066228556798, "frac_reward_zero_std": 1.0, "grad_norm": 4.502013100314358e-21, "kl": 0.016693115234375, "learning_rate": 5.398889636006808e-06, "loss": 0.0007, "num_tokens": 2069062941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6873773150123752, "frac_reward_zero_std": 1.0, "grad_norm": 4.754634851415691e-21, "kl": 0.016998291015625, "learning_rate": 5.393600677102113e-06, "loss": 0.0007, "num_tokens": 2069636797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6875480071690706, "frac_reward_zero_std": 1.0, "grad_norm": 4.3967769132410155e-21, "kl": 0.01654052734375, "learning_rate": 5.388313353302901e-06, "loss": 0.0007, "num_tokens": 2070196781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.687718699325766, "frac_reward_zero_std": 1.0, "grad_norm": 4.729173155878435e-21, "kl": 0.01727294921875, "learning_rate": 5.383027666485976e-06, "loss": 0.0007, "num_tokens": 2070758525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6878893914824614, "frac_reward_zero_std": 1.0, "grad_norm": 4.658444560451803e-21, "kl": 0.01641845703125, "learning_rate": 5.3777436185275624e-06, "loss": 0.0007, "num_tokens": 2071320781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6880600836391568, "frac_reward_zero_std": 1.0, "grad_norm": 4.7566941330883836e-21, "kl": 0.0167236328125, "learning_rate": 5.3724612113033105e-06, "loss": 0.0007, "num_tokens": 2071889181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6882307757958522, "frac_reward_zero_std": 1.0, "grad_norm": 5.122045992384762e-21, "kl": 0.01715087890625, "learning_rate": 5.367180446688287e-06, "loss": 0.0007, "num_tokens": 2072456717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6884014679525475, "frac_reward_zero_std": 1.0, "grad_norm": 4.756647712999853e-21, "kl": 0.017242431640625, "learning_rate": 5.361901326556967e-06, "loss": 0.0007, "num_tokens": 2073022925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6885721601092429, "frac_reward_zero_std": 1.0, "grad_norm": 5.074841807059152e-21, "kl": 0.017608642578125, "learning_rate": 5.356623852783253e-06, "loss": 0.0007, "num_tokens": 2073589677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6887428522659383, "frac_reward_zero_std": 1.0, "grad_norm": 4.3858589730285854e-21, "kl": 0.01678466796875, "learning_rate": 5.351348027240453e-06, "loss": 0.0007, "num_tokens": 2074155085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6889135444226338, "frac_reward_zero_std": 1.0, "grad_norm": 4.688910971061114e-21, "kl": 0.0166015625, "learning_rate": 5.346073851801299e-06, "loss": 0.0007, "num_tokens": 2074718317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6890842365793292, "frac_reward_zero_std": 1.0, "grad_norm": 4.926518934027354e-21, "kl": 0.017242431640625, "learning_rate": 5.340801328337927e-06, "loss": 0.0007, "num_tokens": 2075291037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6892549287360246, "frac_reward_zero_std": 1.0, "grad_norm": 4.8143442387754456e-21, "kl": 0.0169677734375, "learning_rate": 5.3355304587219e-06, "loss": 0.0007, "num_tokens": 2075859245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.68942562089272, "frac_reward_zero_std": 1.0, "grad_norm": 4.708341333663759e-21, "kl": 0.01690673828125, "learning_rate": 5.330261244824178e-06, "loss": 0.0007, "num_tokens": 2076423373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6895963130494154, "frac_reward_zero_std": 1.0, "grad_norm": 4.4323305310929736e-21, "kl": 0.01617431640625, "learning_rate": 5.324993688515152e-06, "loss": 0.0006, "num_tokens": 2076982093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6897670052061108, "frac_reward_zero_std": 1.0, "grad_norm": 4.723057025918051e-21, "kl": 0.01715087890625, "learning_rate": 5.319727791664603e-06, "loss": 0.0007, "num_tokens": 2077548365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6899376973628062, "frac_reward_zero_std": 1.0, "grad_norm": 4.645252122048251e-21, "kl": 0.01690673828125, "learning_rate": 5.314463556141743e-06, "loss": 0.0007, "num_tokens": 2078118093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6901083895195016, "frac_reward_zero_std": 1.0, "grad_norm": 4.457681843525221e-21, "kl": 0.016754150390625, "learning_rate": 5.30920098381518e-06, "loss": 0.0007, "num_tokens": 2078682573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.690279081676197, "frac_reward_zero_std": 1.0, "grad_norm": 4.3131222467406104e-21, "kl": 0.016876220703125, "learning_rate": 5.303940076552943e-06, "loss": 0.0007, "num_tokens": 2079243629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6904497738328924, "frac_reward_zero_std": 1.0, "grad_norm": 4.405973632196602e-21, "kl": 0.016326904296875, "learning_rate": 5.298680836222457e-06, "loss": 0.0007, "num_tokens": 2079804381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6906204659895878, "frac_reward_zero_std": 1.0, "grad_norm": 4.676894703411311e-21, "kl": 0.016754150390625, "learning_rate": 5.29342326469057e-06, "loss": 0.0007, "num_tokens": 2080364653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6907911581462832, "frac_reward_zero_std": 1.0, "grad_norm": 4.225787938871599e-21, "kl": 0.016937255859375, "learning_rate": 5.2881673638235235e-06, "loss": 0.0007, "num_tokens": 2080931197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6909618503029786, "frac_reward_zero_std": 1.0, "grad_norm": 4.380821063794296e-21, "kl": 0.016632080078125, "learning_rate": 5.282913135486978e-06, "loss": 0.0007, "num_tokens": 2081497677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6911325424596739, "frac_reward_zero_std": 1.0, "grad_norm": 4.724487464754954e-21, "kl": 0.016845703125, "learning_rate": 5.277660581545989e-06, "loss": 0.0007, "num_tokens": 2082062941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6913032346163693, "frac_reward_zero_std": 1.0, "grad_norm": 5.330191050540164e-21, "kl": 0.01727294921875, "learning_rate": 5.272409703865027e-06, "loss": 0.0007, "num_tokens": 2082634573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6914739267730647, "frac_reward_zero_std": 1.0, "grad_norm": 4.929957989197421e-21, "kl": 0.016937255859375, "learning_rate": 5.267160504307968e-06, "loss": 0.0007, "num_tokens": 2083197389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6916446189297601, "frac_reward_zero_std": 1.0, "grad_norm": 5.584414215643458e-21, "kl": 0.017486572265625, "learning_rate": 5.261912984738083e-06, "loss": 0.0007, "num_tokens": 2083764061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6918153110864556, "frac_reward_zero_std": 1.0, "grad_norm": 4.448543685783134e-21, "kl": 0.0167236328125, "learning_rate": 5.256667147018051e-06, "loss": 0.0007, "num_tokens": 2084328157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.691986003243151, "frac_reward_zero_std": 1.0, "grad_norm": 5.057002436059512e-21, "kl": 0.016998291015625, "learning_rate": 5.251422993009956e-06, "loss": 0.0007, "num_tokens": 2084889741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6921566953998464, "frac_reward_zero_std": 1.0, "grad_norm": 4.775871131717578e-21, "kl": 0.017364501953125, "learning_rate": 5.24618052457529e-06, "loss": 0.0007, "num_tokens": 2085460589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6923273875565418, "frac_reward_zero_std": 1.0, "grad_norm": 4.92806113715303e-21, "kl": 0.01690673828125, "learning_rate": 5.240939743574933e-06, "loss": 0.0007, "num_tokens": 2086030813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6924980797132372, "frac_reward_zero_std": 1.0, "grad_norm": 5.435879947259185e-21, "kl": 0.01702880859375, "learning_rate": 5.235700651869173e-06, "loss": 0.0007, "num_tokens": 2086604141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6926687718699326, "frac_reward_zero_std": 1.0, "grad_norm": 4.7873650783408765e-21, "kl": 0.01739501953125, "learning_rate": 5.230463251317699e-06, "loss": 0.0007, "num_tokens": 2087176445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.692839464026628, "frac_reward_zero_std": 1.0, "grad_norm": 5.165503673244538e-21, "kl": 0.016937255859375, "learning_rate": 5.225227543779604e-06, "loss": 0.0007, "num_tokens": 2087741949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6930101561833234, "frac_reward_zero_std": 1.0, "grad_norm": 4.569963988743259e-21, "kl": 0.0167236328125, "learning_rate": 5.219993531113372e-06, "loss": 0.0007, "num_tokens": 2088305757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6931808483400188, "frac_reward_zero_std": 1.0, "grad_norm": 5.7614214353314496e-21, "kl": 0.017486572265625, "learning_rate": 5.214761215176884e-06, "loss": 0.0007, "num_tokens": 2088874877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6933515404967142, "frac_reward_zero_std": 1.0, "grad_norm": 5.158082566137293e-21, "kl": 0.016845703125, "learning_rate": 5.209530597827428e-06, "loss": 0.0007, "num_tokens": 2089443213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6935222326534096, "frac_reward_zero_std": 1.0, "grad_norm": 5.011212426451824e-21, "kl": 0.017059326171875, "learning_rate": 5.204301680921688e-06, "loss": 0.0007, "num_tokens": 2090004381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.693692924810105, "frac_reward_zero_std": 1.0, "grad_norm": 4.5343785365834785e-21, "kl": 0.01739501953125, "learning_rate": 5.199074466315735e-06, "loss": 0.0007, "num_tokens": 2090563037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6938636169668003, "frac_reward_zero_std": 1.0, "grad_norm": 4.952554824508453e-21, "kl": 0.0167236328125, "learning_rate": 5.193848955865041e-06, "loss": 0.0007, "num_tokens": 2091127645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6940343091234957, "frac_reward_zero_std": 1.0, "grad_norm": 4.759596243590901e-21, "kl": 0.01708984375, "learning_rate": 5.188625151424474e-06, "loss": 0.0007, "num_tokens": 2091713661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6942050012801911, "frac_reward_zero_std": 1.0, "grad_norm": 4.755887797142524e-21, "kl": 0.0169677734375, "learning_rate": 5.1834030548483014e-06, "loss": 0.0007, "num_tokens": 2092279085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6943756934368865, "frac_reward_zero_std": 1.0, "grad_norm": 4.795916472719731e-21, "kl": 0.016448974609375, "learning_rate": 5.178182667990177e-06, "loss": 0.0007, "num_tokens": 2092839997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.694546385593582, "frac_reward_zero_std": 1.0, "grad_norm": 4.900921023543359e-21, "kl": 0.0172119140625, "learning_rate": 5.172963992703142e-06, "loss": 0.0007, "num_tokens": 2093406317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6947170777502774, "frac_reward_zero_std": 1.0, "grad_norm": 5.6160347539976806e-21, "kl": 0.01708984375, "learning_rate": 5.1677470308396445e-06, "loss": 0.0007, "num_tokens": 2093972749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6948877699069728, "frac_reward_zero_std": 1.0, "grad_norm": 4.667904631139015e-21, "kl": 0.017120361328125, "learning_rate": 5.1625317842515165e-06, "loss": 0.0007, "num_tokens": 2094536109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6950584620636682, "frac_reward_zero_std": 1.0, "grad_norm": 4.71295134867282e-21, "kl": 0.017242431640625, "learning_rate": 5.1573182547899905e-06, "loss": 0.0007, "num_tokens": 2095101661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6952291542203636, "frac_reward_zero_std": 1.0, "grad_norm": 5.654991706802742e-21, "kl": 0.017181396484375, "learning_rate": 5.152106444305665e-06, "loss": 0.0007, "num_tokens": 2095671997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.695399846377059, "frac_reward_zero_std": 1.0, "grad_norm": 4.6610401264479956e-21, "kl": 0.017059326171875, "learning_rate": 5.1468963546485536e-06, "loss": 0.0007, "num_tokens": 2096239837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6955705385337544, "frac_reward_zero_std": 1.0, "grad_norm": 4.762035471604699e-21, "kl": 0.0166015625, "learning_rate": 5.141687987668049e-06, "loss": 0.0007, "num_tokens": 2096804813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6957412306904498, "frac_reward_zero_std": 1.0, "grad_norm": 5.201219470472955e-21, "kl": 0.0167236328125, "learning_rate": 5.1364813452129384e-06, "loss": 0.0007, "num_tokens": 2097370349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6959119228471452, "frac_reward_zero_std": 1.0, "grad_norm": 4.443457699417868e-21, "kl": 0.016876220703125, "learning_rate": 5.131276429131385e-06, "loss": 0.0007, "num_tokens": 2097935693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6960826150038406, "frac_reward_zero_std": 1.0, "grad_norm": 4.496898230386797e-21, "kl": 0.017120361328125, "learning_rate": 5.126073241270946e-06, "loss": 0.0007, "num_tokens": 2098506253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.696253307160536, "frac_reward_zero_std": 1.0, "grad_norm": 4.242752488413893e-21, "kl": 0.016693115234375, "learning_rate": 5.120871783478568e-06, "loss": 0.0007, "num_tokens": 2099071229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6964239993172314, "frac_reward_zero_std": 1.0, "grad_norm": 5.069024626706602e-21, "kl": 0.017181396484375, "learning_rate": 5.115672057600585e-06, "loss": 0.0007, "num_tokens": 2099637933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6965946914739267, "frac_reward_zero_std": 1.0, "grad_norm": 4.92682833258906e-21, "kl": 0.01678466796875, "learning_rate": 5.110474065482705e-06, "loss": 0.0007, "num_tokens": 2100204173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6967653836306221, "frac_reward_zero_std": 1.0, "grad_norm": 4.688186685872074e-21, "kl": 0.017303466796875, "learning_rate": 5.105277808970028e-06, "loss": 0.0007, "num_tokens": 2100773629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6969360757873175, "frac_reward_zero_std": 1.0, "grad_norm": 5.0758972372797736e-21, "kl": 0.017303466796875, "learning_rate": 5.10008328990704e-06, "loss": 0.0007, "num_tokens": 2101344781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6971067679440129, "frac_reward_zero_std": 1.0, "grad_norm": 4.5828404649210324e-21, "kl": 0.016632080078125, "learning_rate": 5.0948905101376105e-06, "loss": 0.0007, "num_tokens": 2101909101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6972774601007083, "frac_reward_zero_std": 1.0, "grad_norm": 4.6449850649835986e-21, "kl": 0.01727294921875, "learning_rate": 5.089699471504985e-06, "loss": 0.0007, "num_tokens": 2102474877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6974481522574038, "frac_reward_zero_std": 1.0, "grad_norm": 4.9959677649811984e-21, "kl": 0.016632080078125, "learning_rate": 5.084510175851791e-06, "loss": 0.0007, "num_tokens": 2103036973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6976188444140992, "frac_reward_zero_std": 1.0, "grad_norm": 5.028976439774109e-21, "kl": 0.017364501953125, "learning_rate": 5.079322625020047e-06, "loss": 0.0007, "num_tokens": 2103604733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6977895365707946, "frac_reward_zero_std": 1.0, "grad_norm": 4.2446617692625234e-21, "kl": 0.01629638671875, "learning_rate": 5.074136820851149e-06, "loss": 0.0007, "num_tokens": 2104165597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.69796022872749, "frac_reward_zero_std": 1.0, "grad_norm": 4.487079760838864e-21, "kl": 0.017486572265625, "learning_rate": 5.068952765185865e-06, "loss": 0.0007, "num_tokens": 2104732781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6981309208841854, "frac_reward_zero_std": 1.0, "grad_norm": 5.452818889659143e-21, "kl": 0.017059326171875, "learning_rate": 5.063770459864346e-06, "loss": 0.0007, "num_tokens": 2105301341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6983016130408808, "frac_reward_zero_std": 1.0, "grad_norm": 4.390290499793697e-21, "kl": 0.016571044921875, "learning_rate": 5.058589906726125e-06, "loss": 0.0007, "num_tokens": 2105866141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6984723051975762, "frac_reward_zero_std": 1.0, "grad_norm": 4.853045408156235e-21, "kl": 0.01666259765625, "learning_rate": 5.0534111076101165e-06, "loss": 0.0007, "num_tokens": 2106435933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6986429973542716, "frac_reward_zero_std": 1.0, "grad_norm": 4.573139071084176e-21, "kl": 0.016937255859375, "learning_rate": 5.0482340643546e-06, "loss": 0.0007, "num_tokens": 2107000125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.698813689510967, "frac_reward_zero_std": 1.0, "grad_norm": 4.665898751899054e-21, "kl": 0.01678466796875, "learning_rate": 5.0430587787972454e-06, "loss": 0.0007, "num_tokens": 2107563325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6989843816676624, "frac_reward_zero_std": 1.0, "grad_norm": 4.203785499869432e-21, "kl": 0.017303466796875, "learning_rate": 5.037885252775085e-06, "loss": 0.0007, "num_tokens": 2108126093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6991550738243578, "frac_reward_zero_std": 1.0, "grad_norm": 5.128616408093247e-21, "kl": 0.0167236328125, "learning_rate": 5.032713488124542e-06, "loss": 0.0007, "num_tokens": 2108691773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6993257659810531, "frac_reward_zero_std": 1.0, "grad_norm": 4.318751408579569e-21, "kl": 0.016693115234375, "learning_rate": 5.027543486681399e-06, "loss": 0.0007, "num_tokens": 2109254701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6994964581377485, "frac_reward_zero_std": 1.0, "grad_norm": 5.113959186189723e-21, "kl": 0.017364501953125, "learning_rate": 5.0223752502808255e-06, "loss": 0.0007, "num_tokens": 2109820093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6996671502944439, "frac_reward_zero_std": 1.0, "grad_norm": 5.214244967090402e-21, "kl": 0.0169677734375, "learning_rate": 5.017208780757353e-06, "loss": 0.0007, "num_tokens": 2110386525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6998378424511393, "frac_reward_zero_std": 1.0, "grad_norm": 5.2353261871591866e-21, "kl": 0.01678466796875, "learning_rate": 5.012044079944897e-06, "loss": 0.0007, "num_tokens": 2110950301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7000085346078347, "frac_reward_zero_std": 1.0, "grad_norm": 5.321792319517485e-21, "kl": 0.01690673828125, "learning_rate": 5.0068811496767365e-06, "loss": 0.0007, "num_tokens": 2111515101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7001792267645301, "frac_reward_zero_std": 1.0, "grad_norm": 4.9136617350564686e-21, "kl": 0.017242431640625, "learning_rate": 5.00171999178553e-06, "loss": 0.0007, "num_tokens": 2112078125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7003499189212256, "frac_reward_zero_std": 1.0, "grad_norm": 4.811254228881355e-21, "kl": 0.0167236328125, "learning_rate": 4.9965606081032965e-06, "loss": 0.0007, "num_tokens": 2112646253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.700520611077921, "frac_reward_zero_std": 1.0, "grad_norm": 4.937852885569765e-21, "kl": 0.01690673828125, "learning_rate": 4.991403000461432e-06, "loss": 0.0007, "num_tokens": 2113208925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7006913032346164, "frac_reward_zero_std": 1.0, "grad_norm": 4.961616277478687e-21, "kl": 0.01690673828125, "learning_rate": 4.986247170690702e-06, "loss": 0.0007, "num_tokens": 2113772605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7008619953913118, "frac_reward_zero_std": 1.0, "grad_norm": 4.90264285539229e-21, "kl": 0.01702880859375, "learning_rate": 4.981093120621243e-06, "loss": 0.0007, "num_tokens": 2114339453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7010326875480072, "frac_reward_zero_std": 1.0, "grad_norm": 4.900362909984842e-21, "kl": 0.016937255859375, "learning_rate": 4.975940852082555e-06, "loss": 0.0007, "num_tokens": 2114908013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7012033797047026, "frac_reward_zero_std": 1.0, "grad_norm": 4.729897946077995e-21, "kl": 0.017333984375, "learning_rate": 4.970790366903503e-06, "loss": 0.0007, "num_tokens": 2115470541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.701374071861398, "frac_reward_zero_std": 1.0, "grad_norm": 4.362604943102211e-21, "kl": 0.016845703125, "learning_rate": 4.965641666912325e-06, "loss": 0.0007, "num_tokens": 2116037069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7015447640180934, "frac_reward_zero_std": 1.0, "grad_norm": 4.575003800148431e-21, "kl": 0.017303466796875, "learning_rate": 4.960494753936631e-06, "loss": 0.0007, "num_tokens": 2116603453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7017154561747888, "frac_reward_zero_std": 1.0, "grad_norm": 5.517143924919189e-21, "kl": 0.0172119140625, "learning_rate": 4.955349629803383e-06, "loss": 0.0007, "num_tokens": 2117166813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7018861483314842, "frac_reward_zero_std": 1.0, "grad_norm": 4.639214105732927e-21, "kl": 0.016693115234375, "learning_rate": 4.950206296338911e-06, "loss": 0.0007, "num_tokens": 2117729421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7020568404881796, "frac_reward_zero_std": 1.0, "grad_norm": 5.503861181130317e-21, "kl": 0.01739501953125, "learning_rate": 4.945064755368917e-06, "loss": 0.0007, "num_tokens": 2118296013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7022275326448749, "frac_reward_zero_std": 1.0, "grad_norm": 4.375302363886373e-21, "kl": 0.017120361328125, "learning_rate": 4.939925008718461e-06, "loss": 0.0007, "num_tokens": 2118859373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7023982248015703, "frac_reward_zero_std": 1.0, "grad_norm": 5.484588228960428e-21, "kl": 0.01751708984375, "learning_rate": 4.934787058211978e-06, "loss": 0.0007, "num_tokens": 2119431501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7025689169582657, "frac_reward_zero_std": 1.0, "grad_norm": 4.629200642909388e-21, "kl": 0.017059326171875, "learning_rate": 4.929650905673237e-06, "loss": 0.0007, "num_tokens": 2119996653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7027396091149611, "frac_reward_zero_std": 1.0, "grad_norm": 4.860534325064562e-21, "kl": 0.01708984375, "learning_rate": 4.924516552925396e-06, "loss": 0.0007, "num_tokens": 2120557341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7029103012716565, "frac_reward_zero_std": 1.0, "grad_norm": 5.374064856386322e-21, "kl": 0.0166015625, "learning_rate": 4.919384001790963e-06, "loss": 0.0007, "num_tokens": 2121123933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.703080993428352, "frac_reward_zero_std": 1.0, "grad_norm": 5.250939186474203e-21, "kl": 0.016876220703125, "learning_rate": 4.9142532540918156e-06, "loss": 0.0007, "num_tokens": 2121694045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7032516855850474, "frac_reward_zero_std": 1.0, "grad_norm": 4.5252442415657616e-21, "kl": 0.016876220703125, "learning_rate": 4.909124311649176e-06, "loss": 0.0007, "num_tokens": 2122257693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7034223777417428, "frac_reward_zero_std": 1.0, "grad_norm": 4.840838651528124e-21, "kl": 0.016387939453125, "learning_rate": 4.903997176283634e-06, "loss": 0.0007, "num_tokens": 2122835293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7035930698984382, "frac_reward_zero_std": 1.0, "grad_norm": 4.810482592413149e-21, "kl": 0.016693115234375, "learning_rate": 4.898871849815137e-06, "loss": 0.0007, "num_tokens": 2123402829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7037637620551336, "frac_reward_zero_std": 1.0, "grad_norm": 4.497995870068549e-21, "kl": 0.01715087890625, "learning_rate": 4.8937483340629985e-06, "loss": 0.0007, "num_tokens": 2123968381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.703934454211829, "frac_reward_zero_std": 1.0, "grad_norm": 4.9205643983898984e-21, "kl": 0.01739501953125, "learning_rate": 4.888626630845875e-06, "loss": 0.0007, "num_tokens": 2124530653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7041051463685244, "frac_reward_zero_std": 1.0, "grad_norm": 4.7199373767771196e-21, "kl": 0.01702880859375, "learning_rate": 4.8835067419817846e-06, "loss": 0.0007, "num_tokens": 2125096413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7042758385252198, "frac_reward_zero_std": 1.0, "grad_norm": 5.307459661905344e-21, "kl": 0.016876220703125, "learning_rate": 4.8783886692881064e-06, "loss": 0.0007, "num_tokens": 2125664925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7044465306819152, "frac_reward_zero_std": 1.0, "grad_norm": 5.083171576435832e-21, "kl": 0.0167236328125, "learning_rate": 4.873272414581575e-06, "loss": 0.0007, "num_tokens": 2126229805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7046172228386106, "frac_reward_zero_std": 1.0, "grad_norm": 4.542110282202593e-21, "kl": 0.016937255859375, "learning_rate": 4.868157979678272e-06, "loss": 0.0007, "num_tokens": 2126806669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.704787914995306, "frac_reward_zero_std": 1.0, "grad_norm": 4.8632876175478055e-21, "kl": 0.016845703125, "learning_rate": 4.863045366393633e-06, "loss": 0.0007, "num_tokens": 2127370381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7049586071520013, "frac_reward_zero_std": 1.0, "grad_norm": 4.991107294341296e-21, "kl": 0.01708984375, "learning_rate": 4.857934576542456e-06, "loss": 0.0007, "num_tokens": 2127936589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7051292993086967, "frac_reward_zero_std": 1.0, "grad_norm": 5.005400789708489e-21, "kl": 0.01708984375, "learning_rate": 4.85282561193889e-06, "loss": 0.0007, "num_tokens": 2128504013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7052999914653921, "frac_reward_zero_std": 1.0, "grad_norm": 4.958140580084957e-21, "kl": 0.016571044921875, "learning_rate": 4.847718474396429e-06, "loss": 0.0007, "num_tokens": 2129069885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7054706836220875, "frac_reward_zero_std": 1.0, "grad_norm": 4.73024786551288e-21, "kl": 0.016998291015625, "learning_rate": 4.84261316572792e-06, "loss": 0.0007, "num_tokens": 2129635149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7056413757787829, "frac_reward_zero_std": 1.0, "grad_norm": 4.5841048901936235e-21, "kl": 0.016632080078125, "learning_rate": 4.837509687745565e-06, "loss": 0.0007, "num_tokens": 2130200925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7058120679354783, "frac_reward_zero_std": 1.0, "grad_norm": 4.480680926502222e-21, "kl": 0.016998291015625, "learning_rate": 4.83240804226092e-06, "loss": 0.0007, "num_tokens": 2130765949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7059827600921738, "frac_reward_zero_std": 1.0, "grad_norm": 4.690340323405563e-21, "kl": 0.0169677734375, "learning_rate": 4.827308231084877e-06, "loss": 0.0007, "num_tokens": 2131340861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7061534522488692, "frac_reward_zero_std": 1.0, "grad_norm": 4.982621510614959e-21, "kl": 0.01708984375, "learning_rate": 4.822210256027692e-06, "loss": 0.0007, "num_tokens": 2131906973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7063241444055646, "frac_reward_zero_std": 1.0, "grad_norm": 4.379956405033257e-21, "kl": 0.017120361328125, "learning_rate": 4.817114118898956e-06, "loss": 0.0007, "num_tokens": 2132476621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.70649483656226, "frac_reward_zero_std": 1.0, "grad_norm": 4.7837855799412265e-21, "kl": 0.017120361328125, "learning_rate": 4.812019821507619e-06, "loss": 0.0007, "num_tokens": 2133039069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7066655287189554, "frac_reward_zero_std": 1.0, "grad_norm": 4.872110450900013e-21, "kl": 0.017059326171875, "learning_rate": 4.806927365661966e-06, "loss": 0.0007, "num_tokens": 2133604141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7068362208756508, "frac_reward_zero_std": 1.0, "grad_norm": 4.9303935832538885e-21, "kl": 0.016754150390625, "learning_rate": 4.801836753169643e-06, "loss": 0.0007, "num_tokens": 2134178141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7070069130323462, "frac_reward_zero_std": 1.0, "grad_norm": 4.804491347712872e-21, "kl": 0.01678466796875, "learning_rate": 4.796747985837627e-06, "loss": 0.0007, "num_tokens": 2134739629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7071776051890416, "frac_reward_zero_std": 1.0, "grad_norm": 4.750919678677764e-21, "kl": 0.016937255859375, "learning_rate": 4.791661065472253e-06, "loss": 0.0007, "num_tokens": 2135303341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.707348297345737, "frac_reward_zero_std": 1.0, "grad_norm": 4.85523824351582e-21, "kl": 0.016937255859375, "learning_rate": 4.786575993879186e-06, "loss": 0.0007, "num_tokens": 2135868941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7075189895024324, "frac_reward_zero_std": 1.0, "grad_norm": 4.489031566124657e-21, "kl": 0.016204833984375, "learning_rate": 4.7814927728634505e-06, "loss": 0.0006, "num_tokens": 2136430701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7076896816591277, "frac_reward_zero_std": 1.0, "grad_norm": 4.8882405063152025e-21, "kl": 0.0177001953125, "learning_rate": 4.776411404229401e-06, "loss": 0.0007, "num_tokens": 2136993389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7078603738158231, "frac_reward_zero_std": 1.0, "grad_norm": 4.682559688524627e-21, "kl": 0.01678466796875, "learning_rate": 4.7713318897807445e-06, "loss": 0.0007, "num_tokens": 2137564605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7080310659725185, "frac_reward_zero_std": 1.0, "grad_norm": 4.293047134508351e-21, "kl": 0.01702880859375, "learning_rate": 4.766254231320521e-06, "loss": 0.0007, "num_tokens": 2138126365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7082017581292139, "frac_reward_zero_std": 1.0, "grad_norm": 5.160103998278784e-21, "kl": 0.016754150390625, "learning_rate": 4.761178430651121e-06, "loss": 0.0007, "num_tokens": 2138693757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7083724502859093, "frac_reward_zero_std": 1.0, "grad_norm": 5.306831787343974e-21, "kl": 0.0177001953125, "learning_rate": 4.7561044895742635e-06, "loss": 0.0007, "num_tokens": 2139258301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7085431424426047, "frac_reward_zero_std": 1.0, "grad_norm": 4.660980302794877e-21, "kl": 0.017120361328125, "learning_rate": 4.751032409891023e-06, "loss": 0.0007, "num_tokens": 2139824221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7087138345993002, "frac_reward_zero_std": 1.0, "grad_norm": 5.130454202175898e-21, "kl": 0.0167236328125, "learning_rate": 4.7459621934017955e-06, "loss": 0.0007, "num_tokens": 2140394877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7088845267559956, "frac_reward_zero_std": 1.0, "grad_norm": 4.6147954039277926e-21, "kl": 0.016571044921875, "learning_rate": 4.740893841906334e-06, "loss": 0.0007, "num_tokens": 2140961277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.709055218912691, "frac_reward_zero_std": 1.0, "grad_norm": 4.721679600557422e-21, "kl": 0.0169677734375, "learning_rate": 4.735827357203712e-06, "loss": 0.0007, "num_tokens": 2141526093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7092259110693864, "frac_reward_zero_std": 1.0, "grad_norm": 4.62834409659887e-21, "kl": 0.01678466796875, "learning_rate": 4.730762741092355e-06, "loss": 0.0007, "num_tokens": 2142088701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7093966032260818, "frac_reward_zero_std": 1.0, "grad_norm": 4.712585600752109e-21, "kl": 0.0169677734375, "learning_rate": 4.725699995370013e-06, "loss": 0.0007, "num_tokens": 2142651325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7095672953827772, "frac_reward_zero_std": 1.0, "grad_norm": 5.003920587769065e-21, "kl": 0.01708984375, "learning_rate": 4.720639121833782e-06, "loss": 0.0007, "num_tokens": 2143220365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7097379875394726, "frac_reward_zero_std": 1.0, "grad_norm": 4.9054169410114955e-21, "kl": 0.0167236328125, "learning_rate": 4.715580122280093e-06, "loss": 0.0007, "num_tokens": 2143785197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.709908679696168, "frac_reward_zero_std": 1.0, "grad_norm": 4.6967063120458734e-21, "kl": 0.016204833984375, "learning_rate": 4.710522998504703e-06, "loss": 0.0006, "num_tokens": 2144351405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7100793718528634, "frac_reward_zero_std": 1.0, "grad_norm": 5.3588574217045735e-21, "kl": 0.017242431640625, "learning_rate": 4.705467752302706e-06, "loss": 0.0007, "num_tokens": 2144916621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7102500640095588, "frac_reward_zero_std": 1.0, "grad_norm": 4.456735055669061e-21, "kl": 0.017120361328125, "learning_rate": 4.700414385468536e-06, "loss": 0.0007, "num_tokens": 2145483005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7104207561662541, "frac_reward_zero_std": 1.0, "grad_norm": 4.823503595687127e-21, "kl": 0.0172119140625, "learning_rate": 4.695362899795958e-06, "loss": 0.0007, "num_tokens": 2146049293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7105914483229495, "frac_reward_zero_std": 1.0, "grad_norm": 4.20429750027403e-21, "kl": 0.016632080078125, "learning_rate": 4.690313297078064e-06, "loss": 0.0007, "num_tokens": 2146614109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7107621404796449, "frac_reward_zero_std": 1.0, "grad_norm": 5.034939166215432e-21, "kl": 0.0167236328125, "learning_rate": 4.685265579107278e-06, "loss": 0.0007, "num_tokens": 2147180109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7109328326363403, "frac_reward_zero_std": 1.0, "grad_norm": 4.4070466701219606e-21, "kl": 0.016998291015625, "learning_rate": 4.68021974767536e-06, "loss": 0.0007, "num_tokens": 2147743005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7111035247930357, "frac_reward_zero_std": 1.0, "grad_norm": 5.297173217979257e-21, "kl": 0.016632080078125, "learning_rate": 4.675175804573402e-06, "loss": 0.0007, "num_tokens": 2148306957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7112742169497311, "frac_reward_zero_std": 1.0, "grad_norm": 5.086199496350253e-21, "kl": 0.01690673828125, "learning_rate": 4.670133751591817e-06, "loss": 0.0007, "num_tokens": 2148866189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7114449091064265, "frac_reward_zero_std": 1.0, "grad_norm": 5.326232390084091e-21, "kl": 0.017425537109375, "learning_rate": 4.665093590520352e-06, "loss": 0.0007, "num_tokens": 2149430653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.711615601263122, "frac_reward_zero_std": 1.0, "grad_norm": 4.579106341059513e-21, "kl": 0.0169677734375, "learning_rate": 4.660055323148082e-06, "loss": 0.0007, "num_tokens": 2149997293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7117862934198174, "frac_reward_zero_std": 1.0, "grad_norm": 4.831522064439354e-21, "kl": 0.01708984375, "learning_rate": 4.655018951263415e-06, "loss": 0.0007, "num_tokens": 2150561309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7119569855765128, "frac_reward_zero_std": 1.0, "grad_norm": 4.636388988043999e-21, "kl": 0.01641845703125, "learning_rate": 4.649984476654078e-06, "loss": 0.0007, "num_tokens": 2151123629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7121276777332082, "frac_reward_zero_std": 1.0, "grad_norm": 4.914885298324278e-21, "kl": 0.01702880859375, "learning_rate": 4.644951901107123e-06, "loss": 0.0007, "num_tokens": 2151690013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7122983698899036, "frac_reward_zero_std": 1.0, "grad_norm": 4.9409403331547516e-21, "kl": 0.01708984375, "learning_rate": 4.639921226408937e-06, "loss": 0.0007, "num_tokens": 2152257757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.712469062046599, "frac_reward_zero_std": 1.0, "grad_norm": 5.101787106115973e-21, "kl": 0.01641845703125, "learning_rate": 4.634892454345229e-06, "loss": 0.0007, "num_tokens": 2152825981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7126397542032944, "frac_reward_zero_std": 1.0, "grad_norm": 5.335641022015099e-21, "kl": 0.017669677734375, "learning_rate": 4.6298655867010365e-06, "loss": 0.0007, "num_tokens": 2153396845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7128104463599898, "frac_reward_zero_std": 1.0, "grad_norm": 4.846948896069322e-21, "kl": 0.017120361328125, "learning_rate": 4.624840625260704e-06, "loss": 0.0007, "num_tokens": 2153963501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7129811385166852, "frac_reward_zero_std": 1.0, "grad_norm": 4.896373981004531e-21, "kl": 0.0172119140625, "learning_rate": 4.619817571807915e-06, "loss": 0.0007, "num_tokens": 2154529885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7131518306733805, "frac_reward_zero_std": 1.0, "grad_norm": 4.236575757449262e-21, "kl": 0.01690673828125, "learning_rate": 4.614796428125676e-06, "loss": 0.0007, "num_tokens": 2155095053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7133225228300759, "frac_reward_zero_std": 1.0, "grad_norm": 4.9062760704163254e-21, "kl": 0.016845703125, "learning_rate": 4.609777195996316e-06, "loss": 0.0007, "num_tokens": 2155658285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7134932149867713, "frac_reward_zero_std": 1.0, "grad_norm": 5.263803540884666e-21, "kl": 0.0167236328125, "learning_rate": 4.604759877201472e-06, "loss": 0.0007, "num_tokens": 2156222109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7136639071434667, "frac_reward_zero_std": 1.0, "grad_norm": 4.6661849549473944e-21, "kl": 0.01751708984375, "learning_rate": 4.599744473522113e-06, "loss": 0.0007, "num_tokens": 2156785789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7138345993001621, "frac_reward_zero_std": 1.0, "grad_norm": 5.3108816080135874e-21, "kl": 0.01727294921875, "learning_rate": 4.594730986738528e-06, "loss": 0.0007, "num_tokens": 2157351325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7140052914568575, "frac_reward_zero_std": 1.0, "grad_norm": 4.4216518257093715e-21, "kl": 0.01678466796875, "learning_rate": 4.589719418630327e-06, "loss": 0.0007, "num_tokens": 2157912621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.714175983613553, "frac_reward_zero_std": 1.0, "grad_norm": 5.1212560203554176e-21, "kl": 0.016754150390625, "learning_rate": 4.5847097709764325e-06, "loss": 0.0007, "num_tokens": 2158485597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7143466757702484, "frac_reward_zero_std": 1.0, "grad_norm": 4.952112636272927e-21, "kl": 0.017333984375, "learning_rate": 4.5797020455550845e-06, "loss": 0.0007, "num_tokens": 2159048637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7145173679269438, "frac_reward_zero_std": 1.0, "grad_norm": 4.6846538726871396e-21, "kl": 0.01702880859375, "learning_rate": 4.574696244143849e-06, "loss": 0.0007, "num_tokens": 2159612589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7146880600836392, "frac_reward_zero_std": 1.0, "grad_norm": 4.589996370881424e-21, "kl": 0.016937255859375, "learning_rate": 4.569692368519608e-06, "loss": 0.0007, "num_tokens": 2160174733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7148587522403346, "frac_reward_zero_std": 1.0, "grad_norm": 5.103391336092839e-21, "kl": 0.017669677734375, "learning_rate": 4.564690420458554e-06, "loss": 0.0007, "num_tokens": 2160741053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.71502944439703, "frac_reward_zero_std": 1.0, "grad_norm": 4.539817807190967e-21, "kl": 0.0169677734375, "learning_rate": 4.5596904017361936e-06, "loss": 0.0007, "num_tokens": 2161302573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7152001365537254, "frac_reward_zero_std": 1.0, "grad_norm": 5.131835026218558e-21, "kl": 0.0169677734375, "learning_rate": 4.554692314127356e-06, "loss": 0.0007, "num_tokens": 2161866829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7153708287104208, "frac_reward_zero_std": 1.0, "grad_norm": 4.629273499594415e-21, "kl": 0.017425537109375, "learning_rate": 4.549696159406187e-06, "loss": 0.0007, "num_tokens": 2162430253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7155415208671162, "frac_reward_zero_std": 1.0, "grad_norm": 4.88952555252087e-21, "kl": 0.0172119140625, "learning_rate": 4.544701939346137e-06, "loss": 0.0007, "num_tokens": 2162997117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7157122130238116, "frac_reward_zero_std": 1.0, "grad_norm": 5.2324442281607735e-21, "kl": 0.0172119140625, "learning_rate": 4.539709655719969e-06, "loss": 0.0007, "num_tokens": 2163558717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7158829051805069, "frac_reward_zero_std": 1.0, "grad_norm": 5.173858390252972e-21, "kl": 0.017059326171875, "learning_rate": 4.5347193102997674e-06, "loss": 0.0007, "num_tokens": 2164122365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7160535973372023, "frac_reward_zero_std": 1.0, "grad_norm": 4.933045640949841e-21, "kl": 0.017364501953125, "learning_rate": 4.529730904856931e-06, "loss": 0.0007, "num_tokens": 2164687261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7162242894938977, "frac_reward_zero_std": 1.0, "grad_norm": 5.246069362655706e-21, "kl": 0.0172119140625, "learning_rate": 4.524744441162152e-06, "loss": 0.0007, "num_tokens": 2165260493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7163949816505931, "frac_reward_zero_std": 1.0, "grad_norm": 4.8803948451992644e-21, "kl": 0.01739501953125, "learning_rate": 4.519759920985457e-06, "loss": 0.0007, "num_tokens": 2165838781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7165656738072885, "frac_reward_zero_std": 1.0, "grad_norm": 4.603735389372518e-21, "kl": 0.016876220703125, "learning_rate": 4.5147773460961596e-06, "loss": 0.0007, "num_tokens": 2166398877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7167363659639839, "frac_reward_zero_std": 1.0, "grad_norm": 4.811798078196611e-21, "kl": 0.0169677734375, "learning_rate": 4.5097967182629035e-06, "loss": 0.0007, "num_tokens": 2166961821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7169070581206793, "frac_reward_zero_std": 1.0, "grad_norm": 4.41541972391949e-21, "kl": 0.016937255859375, "learning_rate": 4.504818039253623e-06, "loss": 0.0007, "num_tokens": 2167521789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7170777502773747, "frac_reward_zero_std": 1.0, "grad_norm": 4.419135174835478e-21, "kl": 0.0167236328125, "learning_rate": 4.499841310835578e-06, "loss": 0.0007, "num_tokens": 2168083981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7172484424340702, "frac_reward_zero_std": 1.0, "grad_norm": 4.250980275764392e-21, "kl": 0.016815185546875, "learning_rate": 4.4948665347753205e-06, "loss": 0.0007, "num_tokens": 2168659437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7174191345907656, "frac_reward_zero_std": 1.0, "grad_norm": 5.0847809628777e-21, "kl": 0.01708984375, "learning_rate": 4.489893712838722e-06, "loss": 0.0007, "num_tokens": 2169229501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.717589826747461, "frac_reward_zero_std": 1.0, "grad_norm": 4.5462546895183926e-21, "kl": 0.017303466796875, "learning_rate": 4.484922846790949e-06, "loss": 0.0007, "num_tokens": 2169797197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7177605189041564, "frac_reward_zero_std": 1.0, "grad_norm": 4.74801931211203e-21, "kl": 0.016754150390625, "learning_rate": 4.479953938396485e-06, "loss": 0.0007, "num_tokens": 2170366061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7179312110608518, "frac_reward_zero_std": 1.0, "grad_norm": 4.911127942716509e-21, "kl": 0.0169677734375, "learning_rate": 4.474986989419108e-06, "loss": 0.0007, "num_tokens": 2170929197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7181019032175472, "frac_reward_zero_std": 1.0, "grad_norm": 4.8826645529760935e-21, "kl": 0.01702880859375, "learning_rate": 4.470022001621912e-06, "loss": 0.0007, "num_tokens": 2171497757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7182725953742426, "frac_reward_zero_std": 1.0, "grad_norm": 4.828763680861745e-21, "kl": 0.0169677734375, "learning_rate": 4.465058976767281e-06, "loss": 0.0007, "num_tokens": 2172060237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.718443287530938, "frac_reward_zero_std": 1.0, "grad_norm": 5.011136775798009e-21, "kl": 0.017425537109375, "learning_rate": 4.4600979166169165e-06, "loss": 0.0007, "num_tokens": 2172628669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7186139796876334, "frac_reward_zero_std": 1.0, "grad_norm": 4.703835770384625e-21, "kl": 0.016571044921875, "learning_rate": 4.4551388229318094e-06, "loss": 0.0007, "num_tokens": 2173194653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7187846718443287, "frac_reward_zero_std": 1.0, "grad_norm": 4.707816071401515e-21, "kl": 0.01678466796875, "learning_rate": 4.450181697472266e-06, "loss": 0.0007, "num_tokens": 2173759837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7189553640010241, "frac_reward_zero_std": 1.0, "grad_norm": 5.524721815389677e-21, "kl": 0.0172119140625, "learning_rate": 4.445226541997878e-06, "loss": 0.0007, "num_tokens": 2174324525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7191260561577195, "frac_reward_zero_std": 1.0, "grad_norm": 5.1075355385655824e-21, "kl": 0.01806640625, "learning_rate": 4.440273358267556e-06, "loss": 0.0007, "num_tokens": 2174890701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7192967483144149, "frac_reward_zero_std": 1.0, "grad_norm": 5.325414196557052e-21, "kl": 0.017059326171875, "learning_rate": 4.435322148039495e-06, "loss": 0.0007, "num_tokens": 2175458925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7194674404711103, "frac_reward_zero_std": 1.0, "grad_norm": 4.061896275481784e-21, "kl": 0.016815185546875, "learning_rate": 4.4303729130712e-06, "loss": 0.0007, "num_tokens": 2176020429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7196381326278057, "frac_reward_zero_std": 1.0, "grad_norm": 4.956034256639193e-21, "kl": 0.017059326171875, "learning_rate": 4.425425655119466e-06, "loss": 0.0007, "num_tokens": 2176588333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7198088247845011, "frac_reward_zero_std": 1.0, "grad_norm": 4.800780041985954e-21, "kl": 0.01690673828125, "learning_rate": 4.420480375940394e-06, "loss": 0.0007, "num_tokens": 2177153469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7199795169411966, "frac_reward_zero_std": 1.0, "grad_norm": 5.248582860971333e-21, "kl": 0.017242431640625, "learning_rate": 4.415537077289382e-06, "loss": 0.0007, "num_tokens": 2177715325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.720150209097892, "frac_reward_zero_std": 1.0, "grad_norm": 4.623002924993889e-21, "kl": 0.01739501953125, "learning_rate": 4.410595760921122e-06, "loss": 0.0007, "num_tokens": 2178282733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7203209012545874, "frac_reward_zero_std": 1.0, "grad_norm": 4.8971184263374945e-21, "kl": 0.0166015625, "learning_rate": 4.405656428589597e-06, "loss": 0.0007, "num_tokens": 2178852461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7204915934112828, "frac_reward_zero_std": 1.0, "grad_norm": 4.950581305545776e-21, "kl": 0.01715087890625, "learning_rate": 4.400719082048094e-06, "loss": 0.0007, "num_tokens": 2179420621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7206622855679782, "frac_reward_zero_std": 1.0, "grad_norm": 4.561813592292236e-21, "kl": 0.01690673828125, "learning_rate": 4.3957837230492e-06, "loss": 0.0007, "num_tokens": 2179997357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7208329777246736, "frac_reward_zero_std": 1.0, "grad_norm": 4.1199411280450355e-21, "kl": 0.017303466796875, "learning_rate": 4.390850353344782e-06, "loss": 0.0007, "num_tokens": 2180562093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.721003669881369, "frac_reward_zero_std": 1.0, "grad_norm": 5.0309936001873315e-21, "kl": 0.016815185546875, "learning_rate": 4.385918974686006e-06, "loss": 0.0007, "num_tokens": 2181127501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7211743620380644, "frac_reward_zero_std": 1.0, "grad_norm": 4.936598859454282e-21, "kl": 0.017059326171875, "learning_rate": 4.380989588823339e-06, "loss": 0.0007, "num_tokens": 2181690829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7213450541947598, "frac_reward_zero_std": 1.0, "grad_norm": 4.944290512662672e-21, "kl": 0.017059326171875, "learning_rate": 4.376062197506536e-06, "loss": 0.0007, "num_tokens": 2182255933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7215157463514551, "frac_reward_zero_std": 1.0, "grad_norm": 4.442395566527099e-21, "kl": 0.016693115234375, "learning_rate": 4.371136802484642e-06, "loss": 0.0007, "num_tokens": 2182819469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7216864385081505, "frac_reward_zero_std": 1.0, "grad_norm": 4.562614314050444e-21, "kl": 0.01702880859375, "learning_rate": 4.366213405505987e-06, "loss": 0.0007, "num_tokens": 2183380541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7218571306648459, "frac_reward_zero_std": 1.0, "grad_norm": 4.72635261651219e-21, "kl": 0.017181396484375, "learning_rate": 4.361292008318206e-06, "loss": 0.0007, "num_tokens": 2183954429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7220278228215413, "frac_reward_zero_std": 1.0, "grad_norm": 4.646228034381831e-21, "kl": 0.01654052734375, "learning_rate": 4.35637261266822e-06, "loss": 0.0007, "num_tokens": 2184514781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7221985149782367, "frac_reward_zero_std": 1.0, "grad_norm": 5.315591050139545e-21, "kl": 0.01708984375, "learning_rate": 4.351455220302232e-06, "loss": 0.0007, "num_tokens": 2185080157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7223692071349321, "frac_reward_zero_std": 1.0, "grad_norm": 4.942266254264936e-21, "kl": 0.016876220703125, "learning_rate": 4.346539832965738e-06, "loss": 0.0007, "num_tokens": 2185641549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7225398992916275, "frac_reward_zero_std": 1.0, "grad_norm": 5.0775002797009496e-21, "kl": 0.01727294921875, "learning_rate": 4.341626452403525e-06, "loss": 0.0007, "num_tokens": 2186211901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.722710591448323, "frac_reward_zero_std": 1.0, "grad_norm": 3.943634560104496e-21, "kl": 0.01708984375, "learning_rate": 4.3367150803596646e-06, "loss": 0.0007, "num_tokens": 2186778669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7228812836050184, "frac_reward_zero_std": 1.0, "grad_norm": 4.886836819700859e-21, "kl": 0.01690673828125, "learning_rate": 4.331805718577526e-06, "loss": 0.0007, "num_tokens": 2187349389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7230519757617138, "frac_reward_zero_std": 1.0, "grad_norm": 5.3078086944556195e-21, "kl": 0.016876220703125, "learning_rate": 4.32689836879974e-06, "loss": 0.0007, "num_tokens": 2187913629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7232226679184092, "frac_reward_zero_std": 1.0, "grad_norm": 4.720782603575944e-21, "kl": 0.016937255859375, "learning_rate": 4.321993032768246e-06, "loss": 0.0007, "num_tokens": 2188482029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7233933600751046, "frac_reward_zero_std": 1.0, "grad_norm": 4.911775483255186e-21, "kl": 0.01708984375, "learning_rate": 4.317089712224261e-06, "loss": 0.0007, "num_tokens": 2189058685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7235640522318, "frac_reward_zero_std": 1.0, "grad_norm": 4.500543491550638e-21, "kl": 0.0167236328125, "learning_rate": 4.31218840890829e-06, "loss": 0.0007, "num_tokens": 2189616845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7237347443884954, "frac_reward_zero_std": 1.0, "grad_norm": 4.510785983653256e-21, "kl": 0.016754150390625, "learning_rate": 4.307289124560117e-06, "loss": 0.0007, "num_tokens": 2190190541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7239054365451908, "frac_reward_zero_std": 1.0, "grad_norm": 4.507811575840178e-21, "kl": 0.0167236328125, "learning_rate": 4.302391860918804e-06, "loss": 0.0007, "num_tokens": 2190768765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7240761287018862, "frac_reward_zero_std": 1.0, "grad_norm": 5.149285351186051e-21, "kl": 0.01702880859375, "learning_rate": 4.297496619722711e-06, "loss": 0.0007, "num_tokens": 2191334557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7242468208585815, "frac_reward_zero_std": 1.0, "grad_norm": 4.6437067061388644e-21, "kl": 0.016754150390625, "learning_rate": 4.292603402709471e-06, "loss": 0.0007, "num_tokens": 2191900477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7244175130152769, "frac_reward_zero_std": 1.0, "grad_norm": 4.8375758536509035e-21, "kl": 0.017608642578125, "learning_rate": 4.287712211615999e-06, "loss": 0.0007, "num_tokens": 2192467005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7245882051719723, "frac_reward_zero_std": 1.0, "grad_norm": 5.196192563316725e-21, "kl": 0.0167236328125, "learning_rate": 4.282823048178486e-06, "loss": 0.0007, "num_tokens": 2193034125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7247588973286677, "frac_reward_zero_std": 1.0, "grad_norm": 5.048131079957204e-21, "kl": 0.0166015625, "learning_rate": 4.277935914132413e-06, "loss": 0.0007, "num_tokens": 2193602205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7249295894853631, "frac_reward_zero_std": 1.0, "grad_norm": 4.938249813436547e-21, "kl": 0.0167236328125, "learning_rate": 4.273050811212539e-06, "loss": 0.0007, "num_tokens": 2194166733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7251002816420585, "frac_reward_zero_std": 1.0, "grad_norm": 4.6060929389818686e-21, "kl": 0.01727294921875, "learning_rate": 4.268167741152895e-06, "loss": 0.0007, "num_tokens": 2194732781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7252709737987539, "frac_reward_zero_std": 1.0, "grad_norm": 4.8329141053618014e-21, "kl": 0.016693115234375, "learning_rate": 4.2632867056867945e-06, "loss": 0.0007, "num_tokens": 2195303117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7254416659554493, "frac_reward_zero_std": 1.0, "grad_norm": 4.7654301603111596e-21, "kl": 0.01702880859375, "learning_rate": 4.258407706546829e-06, "loss": 0.0007, "num_tokens": 2195864765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7256123581121448, "frac_reward_zero_std": 1.0, "grad_norm": 5.552055811954291e-21, "kl": 0.0174560546875, "learning_rate": 4.253530745464871e-06, "loss": 0.0007, "num_tokens": 2196432509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7257830502688402, "frac_reward_zero_std": 1.0, "grad_norm": 4.980449721861332e-21, "kl": 0.016754150390625, "learning_rate": 4.248655824172063e-06, "loss": 0.0007, "num_tokens": 2196995629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7259537424255356, "frac_reward_zero_std": 1.0, "grad_norm": 4.8790483725081734e-21, "kl": 0.017364501953125, "learning_rate": 4.243782944398822e-06, "loss": 0.0007, "num_tokens": 2197566045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.726124434582231, "frac_reward_zero_std": 1.0, "grad_norm": 5.377073646334346e-21, "kl": 0.017578125, "learning_rate": 4.238912107874849e-06, "loss": 0.0007, "num_tokens": 2198138301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7262951267389264, "frac_reward_zero_std": 1.0, "grad_norm": 4.8061881671337645e-21, "kl": 0.0177001953125, "learning_rate": 4.234043316329118e-06, "loss": 0.0007, "num_tokens": 2198704669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7264658188956218, "frac_reward_zero_std": 1.0, "grad_norm": 5.285624046619428e-21, "kl": 0.017242431640625, "learning_rate": 4.22917657148987e-06, "loss": 0.0007, "num_tokens": 2199275165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7266365110523172, "frac_reward_zero_std": 1.0, "grad_norm": 4.439099385634154e-21, "kl": 0.016571044921875, "learning_rate": 4.2243118750846215e-06, "loss": 0.0007, "num_tokens": 2199837373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7268072032090126, "frac_reward_zero_std": 1.0, "grad_norm": 4.580988880738494e-21, "kl": 0.016998291015625, "learning_rate": 4.219449228840165e-06, "loss": 0.0007, "num_tokens": 2200404125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7269778953657079, "frac_reward_zero_std": 1.0, "grad_norm": 4.6883243594564594e-21, "kl": 0.01788330078125, "learning_rate": 4.214588634482573e-06, "loss": 0.0007, "num_tokens": 2200968221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7271485875224033, "frac_reward_zero_std": 1.0, "grad_norm": 5.347959831841464e-21, "kl": 0.0174560546875, "learning_rate": 4.209730093737169e-06, "loss": 0.0007, "num_tokens": 2201537357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7273192796790987, "frac_reward_zero_std": 1.0, "grad_norm": 4.406202899546059e-21, "kl": 0.016693115234375, "learning_rate": 4.204873608328568e-06, "loss": 0.0007, "num_tokens": 2202099757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7274899718357941, "frac_reward_zero_std": 1.0, "grad_norm": 4.753419590856481e-21, "kl": 0.016632080078125, "learning_rate": 4.200019179980641e-06, "loss": 0.0007, "num_tokens": 2202664445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7276606639924895, "frac_reward_zero_std": 1.0, "grad_norm": 4.8496945966930524e-21, "kl": 0.01708984375, "learning_rate": 4.195166810416542e-06, "loss": 0.0007, "num_tokens": 2203230829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7278313561491849, "frac_reward_zero_std": 1.0, "grad_norm": 4.740720523538947e-21, "kl": 0.0166015625, "learning_rate": 4.190316501358679e-06, "loss": 0.0007, "num_tokens": 2203798109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7280020483058803, "frac_reward_zero_std": 1.0, "grad_norm": 4.619369511663352e-21, "kl": 0.016998291015625, "learning_rate": 4.185468254528744e-06, "loss": 0.0007, "num_tokens": 2204360749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7281727404625757, "frac_reward_zero_std": 1.0, "grad_norm": 4.919922489858392e-21, "kl": 0.0162353515625, "learning_rate": 4.180622071647683e-06, "loss": 0.0006, "num_tokens": 2204926893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7283434326192711, "frac_reward_zero_std": 1.0, "grad_norm": 5.154331883053766e-21, "kl": 0.016937255859375, "learning_rate": 4.1757779544357216e-06, "loss": 0.0007, "num_tokens": 2205513693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7285141247759666, "frac_reward_zero_std": 1.0, "grad_norm": 4.260013568709765e-21, "kl": 0.0169677734375, "learning_rate": 4.170935904612341e-06, "loss": 0.0007, "num_tokens": 2206076173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.728684816932662, "frac_reward_zero_std": 1.0, "grad_norm": 5.0802353768743696e-21, "kl": 0.01690673828125, "learning_rate": 4.166095923896301e-06, "loss": 0.0007, "num_tokens": 2206642733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7288555090893574, "frac_reward_zero_std": 1.0, "grad_norm": 4.526882015935451e-21, "kl": 0.01678466796875, "learning_rate": 4.161258014005612e-06, "loss": 0.0007, "num_tokens": 2207202717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7290262012460528, "frac_reward_zero_std": 1.0, "grad_norm": 4.943486964483942e-21, "kl": 0.01708984375, "learning_rate": 4.156422176657567e-06, "loss": 0.0007, "num_tokens": 2207769997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7291968934027482, "frac_reward_zero_std": 1.0, "grad_norm": 5.539834110787107e-21, "kl": 0.01715087890625, "learning_rate": 4.1515884135687026e-06, "loss": 0.0007, "num_tokens": 2208339309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7293675855594436, "frac_reward_zero_std": 1.0, "grad_norm": 5.023933024174457e-21, "kl": 0.01708984375, "learning_rate": 4.1467567264548395e-06, "loss": 0.0007, "num_tokens": 2208906893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.729538277716139, "frac_reward_zero_std": 1.0, "grad_norm": 4.965501934818208e-21, "kl": 0.01708984375, "learning_rate": 4.141927117031045e-06, "loss": 0.0007, "num_tokens": 2209478237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7297089698728343, "frac_reward_zero_std": 1.0, "grad_norm": 4.356974043272733e-21, "kl": 0.0169677734375, "learning_rate": 4.137099587011664e-06, "loss": 0.0007, "num_tokens": 2210045293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7298796620295297, "frac_reward_zero_std": 1.0, "grad_norm": 4.456026293888996e-21, "kl": 0.01702880859375, "learning_rate": 4.132274138110287e-06, "loss": 0.0007, "num_tokens": 2210610845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7300503541862251, "frac_reward_zero_std": 1.0, "grad_norm": 4.967907422202585e-21, "kl": 0.016693115234375, "learning_rate": 4.127450772039778e-06, "loss": 0.0007, "num_tokens": 2211177261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7302210463429205, "frac_reward_zero_std": 1.0, "grad_norm": 4.889443208970688e-21, "kl": 0.0164794921875, "learning_rate": 4.122629490512262e-06, "loss": 0.0007, "num_tokens": 2211741021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7303917384996159, "frac_reward_zero_std": 1.0, "grad_norm": 4.786002878168842e-21, "kl": 0.016998291015625, "learning_rate": 4.117810295239116e-06, "loss": 0.0007, "num_tokens": 2212307453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7305624306563113, "frac_reward_zero_std": 1.0, "grad_norm": 4.3451966179361406e-21, "kl": 0.017333984375, "learning_rate": 4.112993187930977e-06, "loss": 0.0007, "num_tokens": 2212870573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7307331228130067, "frac_reward_zero_std": 1.0, "grad_norm": 4.677398352686476e-21, "kl": 0.017059326171875, "learning_rate": 4.1081781702977474e-06, "loss": 0.0007, "num_tokens": 2213438461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7309038149697021, "frac_reward_zero_std": 1.0, "grad_norm": 5.096953260522328e-21, "kl": 0.016693115234375, "learning_rate": 4.1033652440485884e-06, "loss": 0.0007, "num_tokens": 2214003597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7310745071263975, "frac_reward_zero_std": 1.0, "grad_norm": 5.610373558467018e-21, "kl": 0.016693115234375, "learning_rate": 4.098554410891912e-06, "loss": 0.0007, "num_tokens": 2214567357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.731245199283093, "frac_reward_zero_std": 1.0, "grad_norm": 5.446126937495808e-21, "kl": 0.017059326171875, "learning_rate": 4.093745672535385e-06, "loss": 0.0007, "num_tokens": 2215128973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7314158914397884, "frac_reward_zero_std": 1.0, "grad_norm": 4.6702563047737854e-21, "kl": 0.017486572265625, "learning_rate": 4.088939030685941e-06, "loss": 0.0007, "num_tokens": 2215693437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7315865835964838, "frac_reward_zero_std": 1.0, "grad_norm": 5.016800854120795e-21, "kl": 0.01678466796875, "learning_rate": 4.084134487049768e-06, "loss": 0.0007, "num_tokens": 2216258701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7317572757531792, "frac_reward_zero_std": 1.0, "grad_norm": 5.376826396528638e-21, "kl": 0.017120361328125, "learning_rate": 4.0793320433323015e-06, "loss": 0.0007, "num_tokens": 2216825805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7319279679098746, "frac_reward_zero_std": 1.0, "grad_norm": 5.442678754232566e-21, "kl": 0.016693115234375, "learning_rate": 4.0745317012382325e-06, "loss": 0.0007, "num_tokens": 2217389805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.73209866006657, "frac_reward_zero_std": 1.0, "grad_norm": 5.831393086953283e-21, "kl": 0.017181396484375, "learning_rate": 4.0697334624715126e-06, "loss": 0.0007, "num_tokens": 2217955229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7322693522232654, "frac_reward_zero_std": 1.0, "grad_norm": 4.6369478497693235e-21, "kl": 0.01727294921875, "learning_rate": 4.0649373287353454e-06, "loss": 0.0007, "num_tokens": 2218520413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7324400443799607, "frac_reward_zero_std": 1.0, "grad_norm": 4.5561613747159265e-21, "kl": 0.0166015625, "learning_rate": 4.060143301732184e-06, "loss": 0.0007, "num_tokens": 2219084589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7326107365366561, "frac_reward_zero_std": 1.0, "grad_norm": 4.726495116239679e-21, "kl": 0.01708984375, "learning_rate": 4.05535138316373e-06, "loss": 0.0007, "num_tokens": 2219655645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7327814286933515, "frac_reward_zero_std": 1.0, "grad_norm": 4.866585027797926e-21, "kl": 0.0167236328125, "learning_rate": 4.050561574730947e-06, "loss": 0.0007, "num_tokens": 2220241677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7329521208500469, "frac_reward_zero_std": 1.0, "grad_norm": 4.607574258547738e-21, "kl": 0.01666259765625, "learning_rate": 4.04577387813404e-06, "loss": 0.0007, "num_tokens": 2220804941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7331228130067423, "frac_reward_zero_std": 1.0, "grad_norm": 4.976701404383078e-21, "kl": 0.01666259765625, "learning_rate": 4.04098829507248e-06, "loss": 0.0007, "num_tokens": 2221379213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7332935051634377, "frac_reward_zero_std": 1.0, "grad_norm": 4.956214526565195e-21, "kl": 0.016937255859375, "learning_rate": 4.0362048272449605e-06, "loss": 0.0007, "num_tokens": 2221947277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7334641973201331, "frac_reward_zero_std": 1.0, "grad_norm": 5.286196686020505e-21, "kl": 0.017852783203125, "learning_rate": 4.031423476349445e-06, "loss": 0.0007, "num_tokens": 2222521757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7336348894768285, "frac_reward_zero_std": 1.0, "grad_norm": 4.73180956079105e-21, "kl": 0.016845703125, "learning_rate": 4.026644244083144e-06, "loss": 0.0007, "num_tokens": 2223086845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7338055816335239, "frac_reward_zero_std": 1.0, "grad_norm": 5.113728446850116e-21, "kl": 0.01690673828125, "learning_rate": 4.021867132142516e-06, "loss": 0.0007, "num_tokens": 2223647981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7339762737902193, "frac_reward_zero_std": 1.0, "grad_norm": 4.876159267483761e-21, "kl": 0.01702880859375, "learning_rate": 4.017092142223252e-06, "loss": 0.0007, "num_tokens": 2224213901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7341469659469148, "frac_reward_zero_std": 1.0, "grad_norm": 4.292845009217298e-21, "kl": 0.0167236328125, "learning_rate": 4.0123192760203065e-06, "loss": 0.0007, "num_tokens": 2224774813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7343176581036102, "frac_reward_zero_std": 1.0, "grad_norm": 4.35622842244597e-21, "kl": 0.01666259765625, "learning_rate": 4.007548535227875e-06, "loss": 0.0007, "num_tokens": 2225340445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7344883502603056, "frac_reward_zero_std": 1.0, "grad_norm": 4.2710755962640525e-21, "kl": 0.01678466796875, "learning_rate": 4.002779921539403e-06, "loss": 0.0007, "num_tokens": 2225902877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.734659042417001, "frac_reward_zero_std": 1.0, "grad_norm": 5.09245478166321e-21, "kl": 0.016357421875, "learning_rate": 3.99801343664757e-06, "loss": 0.0007, "num_tokens": 2226469325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7348297345736964, "frac_reward_zero_std": 1.0, "grad_norm": 5.000572761286714e-21, "kl": 0.017181396484375, "learning_rate": 3.993249082244305e-06, "loss": 0.0007, "num_tokens": 2227031933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7350004267303918, "frac_reward_zero_std": 1.0, "grad_norm": 4.676372688176805e-21, "kl": 0.016815185546875, "learning_rate": 3.988486860020785e-06, "loss": 0.0007, "num_tokens": 2227595309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7351711188870872, "frac_reward_zero_std": 1.0, "grad_norm": 4.932637262089706e-21, "kl": 0.016876220703125, "learning_rate": 3.983726771667429e-06, "loss": 0.0007, "num_tokens": 2228159885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7353418110437825, "frac_reward_zero_std": 1.0, "grad_norm": 4.4968434306718534e-21, "kl": 0.016876220703125, "learning_rate": 3.9789688188738954e-06, "loss": 0.0007, "num_tokens": 2228723037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7355125032004779, "frac_reward_zero_std": 1.0, "grad_norm": 4.8073597604608854e-21, "kl": 0.0166015625, "learning_rate": 3.974213003329079e-06, "loss": 0.0007, "num_tokens": 2229290989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7356831953571733, "frac_reward_zero_std": 1.0, "grad_norm": 5.29902258992504e-21, "kl": 0.016876220703125, "learning_rate": 3.96945932672113e-06, "loss": 0.0007, "num_tokens": 2229865485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7358538875138687, "frac_reward_zero_std": 1.0, "grad_norm": 4.893204765319895e-21, "kl": 0.017303466796875, "learning_rate": 3.964707790737432e-06, "loss": 0.0007, "num_tokens": 2230435821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7360245796705641, "frac_reward_zero_std": 1.0, "grad_norm": 4.990318289722296e-21, "kl": 0.017364501953125, "learning_rate": 3.959958397064607e-06, "loss": 0.0007, "num_tokens": 2230995885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7361952718272595, "frac_reward_zero_std": 1.0, "grad_norm": 5.255614655669839e-21, "kl": 0.01702880859375, "learning_rate": 3.955211147388516e-06, "loss": 0.0007, "num_tokens": 2231557581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7363659639839549, "frac_reward_zero_std": 1.0, "grad_norm": 4.19553470794642e-21, "kl": 0.01690673828125, "learning_rate": 3.950466043394262e-06, "loss": 0.0007, "num_tokens": 2232119821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7365366561406503, "frac_reward_zero_std": 1.0, "grad_norm": 4.448679030935546e-21, "kl": 0.01715087890625, "learning_rate": 3.94572308676619e-06, "loss": 0.0007, "num_tokens": 2232694669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7367073482973457, "frac_reward_zero_std": 1.0, "grad_norm": 5.171761229670526e-21, "kl": 0.0169677734375, "learning_rate": 3.940982279187876e-06, "loss": 0.0007, "num_tokens": 2233261149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7368780404540411, "frac_reward_zero_std": 1.0, "grad_norm": 5.3503366111601606e-21, "kl": 0.017120361328125, "learning_rate": 3.936243622342132e-06, "loss": 0.0007, "num_tokens": 2233825005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7370487326107366, "frac_reward_zero_std": 1.0, "grad_norm": 4.627859202631951e-21, "kl": 0.01678466796875, "learning_rate": 3.931507117911012e-06, "loss": 0.0007, "num_tokens": 2234390509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.737219424767432, "frac_reward_zero_std": 1.0, "grad_norm": 5.085452052100463e-21, "kl": 0.017120361328125, "learning_rate": 3.926772767575809e-06, "loss": 0.0007, "num_tokens": 2234957645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7373901169241274, "frac_reward_zero_std": 1.0, "grad_norm": 4.70276802566314e-21, "kl": 0.017059326171875, "learning_rate": 3.922040573017039e-06, "loss": 0.0007, "num_tokens": 2235522797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7375608090808228, "frac_reward_zero_std": 1.0, "grad_norm": 4.977547632037093e-21, "kl": 0.016845703125, "learning_rate": 3.917310535914468e-06, "loss": 0.0007, "num_tokens": 2236084157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7377315012375182, "frac_reward_zero_std": 1.0, "grad_norm": 4.82997285996472e-21, "kl": 0.017059326171875, "learning_rate": 3.912582657947081e-06, "loss": 0.0007, "num_tokens": 2236657517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7379021933942136, "frac_reward_zero_std": 1.0, "grad_norm": 5.3327212761786885e-21, "kl": 0.017303466796875, "learning_rate": 3.90785694079311e-06, "loss": 0.0007, "num_tokens": 2237218653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7380728855509089, "frac_reward_zero_std": 1.0, "grad_norm": 4.6379852004558366e-21, "kl": 0.017120361328125, "learning_rate": 3.9031333861300066e-06, "loss": 0.0007, "num_tokens": 2237783709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7382435777076043, "frac_reward_zero_std": 1.0, "grad_norm": 5.1180927752827e-21, "kl": 0.0172119140625, "learning_rate": 3.898411995634472e-06, "loss": 0.0007, "num_tokens": 2238353741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7384142698642997, "frac_reward_zero_std": 1.0, "grad_norm": 4.8850330601645055e-21, "kl": 0.01727294921875, "learning_rate": 3.893692770982421e-06, "loss": 0.0007, "num_tokens": 2238920973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7385849620209951, "frac_reward_zero_std": 1.0, "grad_norm": 4.578700264348081e-21, "kl": 0.016693115234375, "learning_rate": 3.888975713849014e-06, "loss": 0.0007, "num_tokens": 2239484941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7387556541776905, "frac_reward_zero_std": 1.0, "grad_norm": 4.985978379533848e-21, "kl": 0.01751708984375, "learning_rate": 3.88426082590863e-06, "loss": 0.0007, "num_tokens": 2240051357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7389263463343859, "frac_reward_zero_std": 1.0, "grad_norm": 4.330951429586806e-21, "kl": 0.017059326171875, "learning_rate": 3.879548108834891e-06, "loss": 0.0007, "num_tokens": 2240612157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7390970384910813, "frac_reward_zero_std": 1.0, "grad_norm": 5.134170844645097e-21, "kl": 0.016693115234375, "learning_rate": 3.874837564300634e-06, "loss": 0.0007, "num_tokens": 2241173725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7392677306477767, "frac_reward_zero_std": 1.0, "grad_norm": 5.282540016243887e-21, "kl": 0.0174560546875, "learning_rate": 3.870129193977939e-06, "loss": 0.0007, "num_tokens": 2241740221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7394384228044721, "frac_reward_zero_std": 1.0, "grad_norm": 4.9445675141533026e-21, "kl": 0.01708984375, "learning_rate": 3.865422999538102e-06, "loss": 0.0007, "num_tokens": 2242309213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7396091149611675, "frac_reward_zero_std": 1.0, "grad_norm": 4.580953444534632e-21, "kl": 0.01654052734375, "learning_rate": 3.860718982651658e-06, "loss": 0.0007, "num_tokens": 2242877437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.739779807117863, "frac_reward_zero_std": 1.0, "grad_norm": 4.527673309855278e-21, "kl": 0.016845703125, "learning_rate": 3.856017144988356e-06, "loss": 0.0007, "num_tokens": 2243439517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7399504992745584, "frac_reward_zero_std": 1.0, "grad_norm": 4.669610876006433e-21, "kl": 0.01666259765625, "learning_rate": 3.851317488217186e-06, "loss": 0.0007, "num_tokens": 2244014653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7401211914312538, "frac_reward_zero_std": 1.0, "grad_norm": 5.2678011676023196e-21, "kl": 0.016815185546875, "learning_rate": 3.846620014006349e-06, "loss": 0.0007, "num_tokens": 2244580525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7402918835879492, "frac_reward_zero_std": 1.0, "grad_norm": 5.6476882523857195e-21, "kl": 0.016571044921875, "learning_rate": 3.841924724023282e-06, "loss": 0.0007, "num_tokens": 2245144349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7404625757446446, "frac_reward_zero_std": 1.0, "grad_norm": 4.4141883927835025e-21, "kl": 0.01763916015625, "learning_rate": 3.837231619934651e-06, "loss": 0.0007, "num_tokens": 2245709309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.74063326790134, "frac_reward_zero_std": 1.0, "grad_norm": 4.88674919584856e-21, "kl": 0.01715087890625, "learning_rate": 3.832540703406329e-06, "loss": 0.0007, "num_tokens": 2246277165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7408039600580353, "frac_reward_zero_std": 1.0, "grad_norm": 5.1672581358767854e-21, "kl": 0.016998291015625, "learning_rate": 3.8278519761034215e-06, "loss": 0.0007, "num_tokens": 2246843853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7409746522147307, "frac_reward_zero_std": 1.0, "grad_norm": 4.819964556210866e-21, "kl": 0.0167236328125, "learning_rate": 3.823165439690262e-06, "loss": 0.0007, "num_tokens": 2247409533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7411453443714261, "frac_reward_zero_std": 1.0, "grad_norm": 4.012945649892028e-21, "kl": 0.016876220703125, "learning_rate": 3.818481095830403e-06, "loss": 0.0007, "num_tokens": 2247973085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7413160365281215, "frac_reward_zero_std": 1.0, "grad_norm": 4.648243821476216e-21, "kl": 0.01739501953125, "learning_rate": 3.8137989461866153e-06, "loss": 0.0007, "num_tokens": 2248536509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7414867286848169, "frac_reward_zero_std": 1.0, "grad_norm": 4.7264547204479015e-21, "kl": 0.016632080078125, "learning_rate": 3.8091189924208893e-06, "loss": 0.0007, "num_tokens": 2249097757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7416574208415123, "frac_reward_zero_std": 1.0, "grad_norm": 4.548282830232761e-21, "kl": 0.0167236328125, "learning_rate": 3.804441236194443e-06, "loss": 0.0007, "num_tokens": 2249661165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7418281129982077, "frac_reward_zero_std": 1.0, "grad_norm": 5.1426236841974435e-21, "kl": 0.017425537109375, "learning_rate": 3.799765679167714e-06, "loss": 0.0007, "num_tokens": 2250229741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7419988051549031, "frac_reward_zero_std": 1.0, "grad_norm": 4.690879309675875e-21, "kl": 0.016937255859375, "learning_rate": 3.7950923230003533e-06, "loss": 0.0007, "num_tokens": 2250793325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7421694973115985, "frac_reward_zero_std": 1.0, "grad_norm": 4.654515904803979e-21, "kl": 0.0166015625, "learning_rate": 3.7904211693512305e-06, "loss": 0.0007, "num_tokens": 2251358797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7423401894682939, "frac_reward_zero_std": 1.0, "grad_norm": 5.268870664354326e-21, "kl": 0.016845703125, "learning_rate": 3.785752219878439e-06, "loss": 0.0007, "num_tokens": 2251927485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7425108816249893, "frac_reward_zero_std": 1.0, "grad_norm": 4.473645196249547e-21, "kl": 0.017059326171875, "learning_rate": 3.7810854762392922e-06, "loss": 0.0007, "num_tokens": 2252496605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7426815737816848, "frac_reward_zero_std": 1.0, "grad_norm": 4.8170963044609256e-21, "kl": 0.01715087890625, "learning_rate": 3.7764209400903116e-06, "loss": 0.0007, "num_tokens": 2253060477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7428522659383802, "frac_reward_zero_std": 1.0, "grad_norm": 4.9594667200085715e-21, "kl": 0.01739501953125, "learning_rate": 3.7717586130872352e-06, "loss": 0.0007, "num_tokens": 2253630909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7430229580950756, "frac_reward_zero_std": 1.0, "grad_norm": 4.876072124248184e-21, "kl": 0.0174560546875, "learning_rate": 3.7670984968850244e-06, "loss": 0.0007, "num_tokens": 2254199453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.743193650251771, "frac_reward_zero_std": 1.0, "grad_norm": 5.929344402508968e-21, "kl": 0.0167236328125, "learning_rate": 3.762440593137856e-06, "loss": 0.0007, "num_tokens": 2254773997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7433643424084664, "frac_reward_zero_std": 1.0, "grad_norm": 4.733573107216385e-21, "kl": 0.016876220703125, "learning_rate": 3.7577849034991144e-06, "loss": 0.0007, "num_tokens": 2255337405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7435350345651617, "frac_reward_zero_std": 1.0, "grad_norm": 5.146217340512447e-21, "kl": 0.016876220703125, "learning_rate": 3.753131429621397e-06, "loss": 0.0007, "num_tokens": 2255902317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7437057267218571, "frac_reward_zero_std": 1.0, "grad_norm": 5.2937272311085966e-21, "kl": 0.017578125, "learning_rate": 3.7484801731565234e-06, "loss": 0.0007, "num_tokens": 2256488925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7438764188785525, "frac_reward_zero_std": 1.0, "grad_norm": 4.636950006783392e-21, "kl": 0.017333984375, "learning_rate": 3.7438311357555212e-06, "loss": 0.0007, "num_tokens": 2257057373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7440471110352479, "frac_reward_zero_std": 1.0, "grad_norm": 4.427762633498335e-21, "kl": 0.016571044921875, "learning_rate": 3.739184319068638e-06, "loss": 0.0007, "num_tokens": 2257618781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7442178031919433, "frac_reward_zero_std": 1.0, "grad_norm": 5.039524239370728e-21, "kl": 0.017120361328125, "learning_rate": 3.734539724745313e-06, "loss": 0.0007, "num_tokens": 2258182749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7443884953486387, "frac_reward_zero_std": 1.0, "grad_norm": 5.4245806019335995e-21, "kl": 0.01666259765625, "learning_rate": 3.7298973544342145e-06, "loss": 0.0007, "num_tokens": 2258758829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7445591875053341, "frac_reward_zero_std": 1.0, "grad_norm": 4.897705260896932e-21, "kl": 0.017242431640625, "learning_rate": 3.7252572097832173e-06, "loss": 0.0007, "num_tokens": 2259332077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7447298796620295, "frac_reward_zero_std": 1.0, "grad_norm": 4.5857024223577e-21, "kl": 0.01654052734375, "learning_rate": 3.7206192924394103e-06, "loss": 0.0007, "num_tokens": 2259896493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7449005718187249, "frac_reward_zero_std": 1.0, "grad_norm": 5.444174196310039e-21, "kl": 0.017364501953125, "learning_rate": 3.7159836040490804e-06, "loss": 0.0007, "num_tokens": 2260464045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7450712639754203, "frac_reward_zero_std": 1.0, "grad_norm": 4.507484406371683e-21, "kl": 0.01715087890625, "learning_rate": 3.7113501462577284e-06, "loss": 0.0007, "num_tokens": 2261029549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7452419561321157, "frac_reward_zero_std": 1.0, "grad_norm": 5.317270113424952e-21, "kl": 0.017333984375, "learning_rate": 3.7067189207100673e-06, "loss": 0.0007, "num_tokens": 2261591117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7454126482888112, "frac_reward_zero_std": 1.0, "grad_norm": 4.511572690611206e-21, "kl": 0.016693115234375, "learning_rate": 3.702089929050018e-06, "loss": 0.0007, "num_tokens": 2262157949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7455833404455066, "frac_reward_zero_std": 1.0, "grad_norm": 5.2890561998238555e-21, "kl": 0.01727294921875, "learning_rate": 3.6974631729207024e-06, "loss": 0.0007, "num_tokens": 2262730157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.745754032602202, "frac_reward_zero_std": 1.0, "grad_norm": 4.8470064276527615e-21, "kl": 0.0167236328125, "learning_rate": 3.692838653964449e-06, "loss": 0.0007, "num_tokens": 2263291789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7459247247588974, "frac_reward_zero_std": 1.0, "grad_norm": 4.8962169288129736e-21, "kl": 0.0166015625, "learning_rate": 3.6882163738227973e-06, "loss": 0.0007, "num_tokens": 2263857309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7460954169155928, "frac_reward_zero_std": 1.0, "grad_norm": 4.441402356620999e-21, "kl": 0.016632080078125, "learning_rate": 3.6835963341364945e-06, "loss": 0.0007, "num_tokens": 2264425293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7462661090722881, "frac_reward_zero_std": 1.0, "grad_norm": 5.200224231393513e-21, "kl": 0.017364501953125, "learning_rate": 3.678978536545483e-06, "loss": 0.0007, "num_tokens": 2264992573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7464368012289835, "frac_reward_zero_std": 1.0, "grad_norm": 5.2974027461967414e-21, "kl": 0.017669677734375, "learning_rate": 3.6743629826889136e-06, "loss": 0.0007, "num_tokens": 2265559661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7466074933856789, "frac_reward_zero_std": 1.0, "grad_norm": 4.80856275046122e-21, "kl": 0.01702880859375, "learning_rate": 3.669749674205142e-06, "loss": 0.0007, "num_tokens": 2266123773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7467781855423743, "frac_reward_zero_std": 1.0, "grad_norm": 5.1597212668655e-21, "kl": 0.016937255859375, "learning_rate": 3.66513861273173e-06, "loss": 0.0007, "num_tokens": 2266687789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7469488776990697, "frac_reward_zero_std": 1.0, "grad_norm": 4.709943864213221e-21, "kl": 0.017333984375, "learning_rate": 3.6605297999054356e-06, "loss": 0.0007, "num_tokens": 2267254349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7471195698557651, "frac_reward_zero_std": 1.0, "grad_norm": 4.882918120436752e-21, "kl": 0.01678466796875, "learning_rate": 3.655923237362218e-06, "loss": 0.0007, "num_tokens": 2267817517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7472902620124605, "frac_reward_zero_std": 1.0, "grad_norm": 4.521921232045433e-21, "kl": 0.017181396484375, "learning_rate": 3.6513189267372417e-06, "loss": 0.0007, "num_tokens": 2268383421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7474609541691559, "frac_reward_zero_std": 1.0, "grad_norm": 4.6494090290366425e-21, "kl": 0.017486572265625, "learning_rate": 3.646716869664876e-06, "loss": 0.0007, "num_tokens": 2268947725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7476316463258513, "frac_reward_zero_std": 1.0, "grad_norm": 4.970500711964608e-21, "kl": 0.017608642578125, "learning_rate": 3.6421170677786787e-06, "loss": 0.0007, "num_tokens": 2269511693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7478023384825467, "frac_reward_zero_std": 1.0, "grad_norm": 4.371785015066291e-21, "kl": 0.016815185546875, "learning_rate": 3.637519522711418e-06, "loss": 0.0007, "num_tokens": 2270076893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7479730306392421, "frac_reward_zero_std": 1.0, "grad_norm": 5.6041798792640876e-21, "kl": 0.0174560546875, "learning_rate": 3.632924236095052e-06, "loss": 0.0007, "num_tokens": 2270646125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7481437227959375, "frac_reward_zero_std": 1.0, "grad_norm": 4.802222490980636e-21, "kl": 0.016998291015625, "learning_rate": 3.628331209560747e-06, "loss": 0.0007, "num_tokens": 2271212637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.748314414952633, "frac_reward_zero_std": 1.0, "grad_norm": 4.967048673349058e-21, "kl": 0.017364501953125, "learning_rate": 3.623740444738855e-06, "loss": 0.0007, "num_tokens": 2271780189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7484851071093284, "frac_reward_zero_std": 1.0, "grad_norm": 4.788814541588674e-21, "kl": 0.01690673828125, "learning_rate": 3.6191519432589395e-06, "loss": 0.0007, "num_tokens": 2272343309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7486557992660238, "frac_reward_zero_std": 1.0, "grad_norm": 4.585109417110134e-21, "kl": 0.017120361328125, "learning_rate": 3.614565706749744e-06, "loss": 0.0007, "num_tokens": 2272901677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7488264914227192, "frac_reward_zero_std": 1.0, "grad_norm": 4.3818570873012e-21, "kl": 0.017364501953125, "learning_rate": 3.6099817368392267e-06, "loss": 0.0007, "num_tokens": 2273467437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7489971835794145, "frac_reward_zero_std": 1.0, "grad_norm": 4.51295508562403e-21, "kl": 0.01751708984375, "learning_rate": 3.6054000351545217e-06, "loss": 0.0007, "num_tokens": 2274033325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7491678757361099, "frac_reward_zero_std": 1.0, "grad_norm": 4.950193840600326e-21, "kl": 0.0166015625, "learning_rate": 3.600820603321976e-06, "loss": 0.0007, "num_tokens": 2274597789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7493385678928053, "frac_reward_zero_std": 1.0, "grad_norm": 4.961757891353893e-21, "kl": 0.017242431640625, "learning_rate": 3.5962434429671158e-06, "loss": 0.0007, "num_tokens": 2275170445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7495092600495007, "frac_reward_zero_std": 1.0, "grad_norm": 4.610488310870987e-21, "kl": 0.016998291015625, "learning_rate": 3.5916685557146745e-06, "loss": 0.0007, "num_tokens": 2275739901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7496799522061961, "frac_reward_zero_std": 1.0, "grad_norm": 4.1886876416345095e-21, "kl": 0.017059326171875, "learning_rate": 3.5870959431885653e-06, "loss": 0.0007, "num_tokens": 2276305821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7498506443628915, "frac_reward_zero_std": 1.0, "grad_norm": 5.6161904331426024e-21, "kl": 0.0172119140625, "learning_rate": 3.582525607011906e-06, "loss": 0.0007, "num_tokens": 2276876909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7500213365195869, "frac_reward_zero_std": 1.0, "grad_norm": 4.752380222078569e-21, "kl": 0.017242431640625, "learning_rate": 3.577957548806997e-06, "loss": 0.0007, "num_tokens": 2277441597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7501920286762823, "frac_reward_zero_std": 1.0, "grad_norm": 4.7107085175644966e-21, "kl": 0.017242431640625, "learning_rate": 3.573391770195338e-06, "loss": 0.0007, "num_tokens": 2278006221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7503627208329777, "frac_reward_zero_std": 1.0, "grad_norm": 5.590798433423794e-21, "kl": 0.017547607421875, "learning_rate": 3.568828272797611e-06, "loss": 0.0007, "num_tokens": 2278570317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7505334129896731, "frac_reward_zero_std": 1.0, "grad_norm": 4.6055208224111986e-21, "kl": 0.016845703125, "learning_rate": 3.564267058233697e-06, "loss": 0.0007, "num_tokens": 2279133789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7507041051463685, "frac_reward_zero_std": 1.0, "grad_norm": 4.591337834885713e-21, "kl": 0.017364501953125, "learning_rate": 3.5597081281226587e-06, "loss": 0.0007, "num_tokens": 2279696669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.750874797303064, "frac_reward_zero_std": 1.0, "grad_norm": 4.3216409050577124e-21, "kl": 0.016754150390625, "learning_rate": 3.555151484082756e-06, "loss": 0.0007, "num_tokens": 2280259485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7510454894597594, "frac_reward_zero_std": 1.0, "grad_norm": 4.2472953972369476e-21, "kl": 0.01727294921875, "learning_rate": 3.5505971277314276e-06, "loss": 0.0007, "num_tokens": 2280822605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7512161816164548, "frac_reward_zero_std": 1.0, "grad_norm": 4.9777307726931595e-21, "kl": 0.017120361328125, "learning_rate": 3.54604506068531e-06, "loss": 0.0007, "num_tokens": 2281389357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7513868737731502, "frac_reward_zero_std": 1.0, "grad_norm": 4.8783001990059875e-21, "kl": 0.0167236328125, "learning_rate": 3.5414952845602235e-06, "loss": 0.0007, "num_tokens": 2281953581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7515575659298456, "frac_reward_zero_std": 1.0, "grad_norm": 5.278462413006055e-21, "kl": 0.017547607421875, "learning_rate": 3.5369478009711734e-06, "loss": 0.0007, "num_tokens": 2282514813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7517282580865409, "frac_reward_zero_std": 1.0, "grad_norm": 4.796957330951446e-21, "kl": 0.016815185546875, "learning_rate": 3.532402611532347e-06, "loss": 0.0007, "num_tokens": 2283076717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7518989502432363, "frac_reward_zero_std": 1.0, "grad_norm": 4.416850374005228e-21, "kl": 0.016754150390625, "learning_rate": 3.527859717857127e-06, "loss": 0.0007, "num_tokens": 2283637325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7520696423999317, "frac_reward_zero_std": 1.0, "grad_norm": 5.238079040677684e-21, "kl": 0.016876220703125, "learning_rate": 3.52331912155808e-06, "loss": 0.0007, "num_tokens": 2284202749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7522403345566271, "frac_reward_zero_std": 1.0, "grad_norm": 4.5480466445007675e-21, "kl": 0.01739501953125, "learning_rate": 3.518780824246949e-06, "loss": 0.0007, "num_tokens": 2284775981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7524110267133225, "frac_reward_zero_std": 1.0, "grad_norm": 4.7462860697791884e-21, "kl": 0.01702880859375, "learning_rate": 3.514244827534664e-06, "loss": 0.0007, "num_tokens": 2285338541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7525817188700179, "frac_reward_zero_std": 1.0, "grad_norm": 4.234470761221908e-21, "kl": 0.016510009765625, "learning_rate": 3.509711133031343e-06, "loss": 0.0007, "num_tokens": 2285904189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7527524110267133, "frac_reward_zero_std": 1.0, "grad_norm": 5.353676146588274e-21, "kl": 0.01654052734375, "learning_rate": 3.5051797423462873e-06, "loss": 0.0007, "num_tokens": 2286466733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7529231031834087, "frac_reward_zero_std": 1.0, "grad_norm": 4.731794533515727e-21, "kl": 0.017333984375, "learning_rate": 3.500650657087974e-06, "loss": 0.0007, "num_tokens": 2287036397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7530937953401041, "frac_reward_zero_std": 1.0, "grad_norm": 4.7736488515422354e-21, "kl": 0.016754150390625, "learning_rate": 3.4961238788640595e-06, "loss": 0.0007, "num_tokens": 2287599357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7532644874967995, "frac_reward_zero_std": 1.0, "grad_norm": 5.099576950480257e-21, "kl": 0.0174560546875, "learning_rate": 3.4915994092813933e-06, "loss": 0.0007, "num_tokens": 2288167821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7534351796534949, "frac_reward_zero_std": 1.0, "grad_norm": 4.617804715816037e-21, "kl": 0.016937255859375, "learning_rate": 3.4870772499459994e-06, "loss": 0.0007, "num_tokens": 2288731469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7536058718101903, "frac_reward_zero_std": 1.0, "grad_norm": 5.974327569695648e-21, "kl": 0.018341064453125, "learning_rate": 3.482557402463078e-06, "loss": 0.0007, "num_tokens": 2289298525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7537765639668857, "frac_reward_zero_std": 1.0, "grad_norm": 5.173205629035065e-21, "kl": 0.016845703125, "learning_rate": 3.4780398684370107e-06, "loss": 0.0007, "num_tokens": 2289868989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7539472561235812, "frac_reward_zero_std": 1.0, "grad_norm": 4.685925556206429e-21, "kl": 0.017120361328125, "learning_rate": 3.4735246494713605e-06, "loss": 0.0007, "num_tokens": 2290437053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7541179482802766, "frac_reward_zero_std": 1.0, "grad_norm": 4.878322117611364e-21, "kl": 0.0167236328125, "learning_rate": 3.4690117471688667e-06, "loss": 0.0007, "num_tokens": 2291005197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.754288640436972, "frac_reward_zero_std": 1.0, "grad_norm": 4.731770509887052e-21, "kl": 0.01702880859375, "learning_rate": 3.464501163131455e-06, "loss": 0.0007, "num_tokens": 2291580349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7544593325936674, "frac_reward_zero_std": 1.0, "grad_norm": 4.669230605610092e-21, "kl": 0.01690673828125, "learning_rate": 3.459992898960206e-06, "loss": 0.0007, "num_tokens": 2292143101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7546300247503627, "frac_reward_zero_std": 1.0, "grad_norm": 4.797224993624896e-21, "kl": 0.016754150390625, "learning_rate": 3.4554869562553973e-06, "loss": 0.0007, "num_tokens": 2292707197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7548007169070581, "frac_reward_zero_std": 1.0, "grad_norm": 4.1062758320870446e-21, "kl": 0.016632080078125, "learning_rate": 3.4509833366164746e-06, "loss": 0.0007, "num_tokens": 2293268909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7549714090637535, "frac_reward_zero_std": 1.0, "grad_norm": 4.440476864345259e-21, "kl": 0.016845703125, "learning_rate": 3.4464820416420653e-06, "loss": 0.0007, "num_tokens": 2293840189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7551421012204489, "frac_reward_zero_std": 1.0, "grad_norm": 4.375675957259508e-21, "kl": 0.016448974609375, "learning_rate": 3.4419830729299632e-06, "loss": 0.0007, "num_tokens": 2294402829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7553127933771443, "frac_reward_zero_std": 1.0, "grad_norm": 4.3970540010705865e-21, "kl": 0.016998291015625, "learning_rate": 3.4374864320771364e-06, "loss": 0.0007, "num_tokens": 2294965629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7554834855338397, "frac_reward_zero_std": 1.0, "grad_norm": 4.9978472608074256e-21, "kl": 0.0169677734375, "learning_rate": 3.432992120679731e-06, "loss": 0.0007, "num_tokens": 2295530605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7556541776905351, "frac_reward_zero_std": 1.0, "grad_norm": 5.183121510120199e-21, "kl": 0.0174560546875, "learning_rate": 3.4285001403330732e-06, "loss": 0.0007, "num_tokens": 2296102221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7558248698472305, "frac_reward_zero_std": 1.0, "grad_norm": 5.299175792513098e-21, "kl": 0.017181396484375, "learning_rate": 3.4240104926316466e-06, "loss": 0.0007, "num_tokens": 2296663325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7559955620039259, "frac_reward_zero_std": 1.0, "grad_norm": 5.136983557466323e-21, "kl": 0.017852783203125, "learning_rate": 3.419523179169113e-06, "loss": 0.0007, "num_tokens": 2297225741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7561662541606213, "frac_reward_zero_std": 1.0, "grad_norm": 5.002053569652842e-21, "kl": 0.01666259765625, "learning_rate": 3.415038201538309e-06, "loss": 0.0007, "num_tokens": 2297787389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7563369463173167, "frac_reward_zero_std": 1.0, "grad_norm": 5.267300508828864e-21, "kl": 0.016815185546875, "learning_rate": 3.4105555613312425e-06, "loss": 0.0007, "num_tokens": 2298350173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7565076384740121, "frac_reward_zero_std": 1.0, "grad_norm": 5.117137724412318e-21, "kl": 0.01678466796875, "learning_rate": 3.406075260139088e-06, "loss": 0.0007, "num_tokens": 2298914205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7566783306307076, "frac_reward_zero_std": 1.0, "grad_norm": 4.718960745760582e-21, "kl": 0.017120361328125, "learning_rate": 3.401597299552184e-06, "loss": 0.0007, "num_tokens": 2299479149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.756849022787403, "frac_reward_zero_std": 1.0, "grad_norm": 4.2413526634891044e-21, "kl": 0.016265869140625, "learning_rate": 3.3971216811600518e-06, "loss": 0.0007, "num_tokens": 2300040621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7570197149440984, "frac_reward_zero_std": 1.0, "grad_norm": 4.6977430294318605e-21, "kl": 0.016998291015625, "learning_rate": 3.3926484065513744e-06, "loss": 0.0007, "num_tokens": 2300601405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7571904071007938, "frac_reward_zero_std": 1.0, "grad_norm": 5.8606545479839764e-21, "kl": 0.017059326171875, "learning_rate": 3.388177477314002e-06, "loss": 0.0007, "num_tokens": 2301166541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7573610992574891, "frac_reward_zero_std": 1.0, "grad_norm": 4.587248269008616e-21, "kl": 0.01690673828125, "learning_rate": 3.3837088950349482e-06, "loss": 0.0007, "num_tokens": 2301734253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7575317914141845, "frac_reward_zero_std": 1.0, "grad_norm": 4.489093137600636e-21, "kl": 0.017120361328125, "learning_rate": 3.3792426613004026e-06, "loss": 0.0007, "num_tokens": 2302296013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7577024835708799, "frac_reward_zero_std": 1.0, "grad_norm": 5.061012004076338e-21, "kl": 0.01715087890625, "learning_rate": 3.3747787776957197e-06, "loss": 0.0007, "num_tokens": 2302863085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7578731757275753, "frac_reward_zero_std": 1.0, "grad_norm": 5.46070443349325e-21, "kl": 0.01739501953125, "learning_rate": 3.3703172458054114e-06, "loss": 0.0007, "num_tokens": 2303431373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7580438678842707, "frac_reward_zero_std": 1.0, "grad_norm": 5.08489279400209e-21, "kl": 0.017425537109375, "learning_rate": 3.3658580672131646e-06, "loss": 0.0007, "num_tokens": 2303998205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7582145600409661, "frac_reward_zero_std": 1.0, "grad_norm": 4.418153561754101e-21, "kl": 0.016815185546875, "learning_rate": 3.3614012435018226e-06, "loss": 0.0007, "num_tokens": 2304562045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7583852521976615, "frac_reward_zero_std": 1.0, "grad_norm": 4.6730372906078854e-21, "kl": 0.016357421875, "learning_rate": 3.3569467762534037e-06, "loss": 0.0007, "num_tokens": 2305123885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7585559443543569, "frac_reward_zero_std": 1.0, "grad_norm": 4.391356468929572e-21, "kl": 0.016632080078125, "learning_rate": 3.3524946670490753e-06, "loss": 0.0007, "num_tokens": 2305693565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7587266365110523, "frac_reward_zero_std": 1.0, "grad_norm": 4.662319766767352e-21, "kl": 0.017364501953125, "learning_rate": 3.348044917469182e-06, "loss": 0.0007, "num_tokens": 2306255165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7588973286677477, "frac_reward_zero_std": 1.0, "grad_norm": 4.83916556608883e-21, "kl": 0.016571044921875, "learning_rate": 3.3435975290932176e-06, "loss": 0.0007, "num_tokens": 2306833789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7590680208244431, "frac_reward_zero_std": 1.0, "grad_norm": 5.25901096749055e-21, "kl": 0.017242431640625, "learning_rate": 3.339152503499852e-06, "loss": 0.0007, "num_tokens": 2307396765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7592387129811385, "frac_reward_zero_std": 1.0, "grad_norm": 4.713491162912464e-21, "kl": 0.016876220703125, "learning_rate": 3.3347098422669e-06, "loss": 0.0007, "num_tokens": 2307959101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.759409405137834, "frac_reward_zero_std": 1.0, "grad_norm": 4.947964371322981e-21, "kl": 0.017059326171875, "learning_rate": 3.3302695469713553e-06, "loss": 0.0007, "num_tokens": 2308523917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7595800972945294, "frac_reward_zero_std": 1.0, "grad_norm": 4.464130457533119e-21, "kl": 0.016632080078125, "learning_rate": 3.3258316191893547e-06, "loss": 0.0007, "num_tokens": 2309086925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7597507894512248, "frac_reward_zero_std": 1.0, "grad_norm": 4.6599067245106896e-21, "kl": 0.017059326171875, "learning_rate": 3.321396060496209e-06, "loss": 0.0007, "num_tokens": 2309651565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7599214816079202, "frac_reward_zero_std": 1.0, "grad_norm": 5.065919173591556e-21, "kl": 0.01715087890625, "learning_rate": 3.316962872466375e-06, "loss": 0.0007, "num_tokens": 2310214877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7600921737646155, "frac_reward_zero_std": 1.0, "grad_norm": 5.071802254533686e-21, "kl": 0.016510009765625, "learning_rate": 3.3125320566734796e-06, "loss": 0.0007, "num_tokens": 2310788877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7602628659213109, "frac_reward_zero_std": 1.0, "grad_norm": 4.657797969508982e-21, "kl": 0.016876220703125, "learning_rate": 3.3081036146903013e-06, "loss": 0.0007, "num_tokens": 2311353117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7604335580780063, "frac_reward_zero_std": 1.0, "grad_norm": 5.66393361067789e-21, "kl": 0.017578125, "learning_rate": 3.303677548088773e-06, "loss": 0.0007, "num_tokens": 2311913805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7606042502347017, "frac_reward_zero_std": 1.0, "grad_norm": 4.7449622805993616e-21, "kl": 0.016632080078125, "learning_rate": 3.2992538584399913e-06, "loss": 0.0007, "num_tokens": 2312480093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7607749423913971, "frac_reward_zero_std": 1.0, "grad_norm": 5.19058113588142e-21, "kl": 0.017059326171875, "learning_rate": 3.2948325473142105e-06, "loss": 0.0007, "num_tokens": 2313048269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7609456345480925, "frac_reward_zero_std": 1.0, "grad_norm": 5.0857641505591845e-21, "kl": 0.016815185546875, "learning_rate": 3.290413616280832e-06, "loss": 0.0007, "num_tokens": 2313610477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7611163267047879, "frac_reward_zero_std": 1.0, "grad_norm": 4.796163531391903e-21, "kl": 0.016632080078125, "learning_rate": 3.285997066908415e-06, "loss": 0.0007, "num_tokens": 2314177709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7612870188614833, "frac_reward_zero_std": 1.0, "grad_norm": 4.700532379724206e-21, "kl": 0.01654052734375, "learning_rate": 3.2815829007646783e-06, "loss": 0.0007, "num_tokens": 2314740509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7614577110181787, "frac_reward_zero_std": 1.0, "grad_norm": 5.069968925139011e-21, "kl": 0.01702880859375, "learning_rate": 3.27717111941649e-06, "loss": 0.0007, "num_tokens": 2315302093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7616284031748741, "frac_reward_zero_std": 1.0, "grad_norm": 5.794761252886416e-21, "kl": 0.016876220703125, "learning_rate": 3.2727617244298805e-06, "loss": 0.0007, "num_tokens": 2315867357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7617990953315695, "frac_reward_zero_std": 1.0, "grad_norm": 4.8365343211195055e-21, "kl": 0.017730712890625, "learning_rate": 3.2683547173700135e-06, "loss": 0.0007, "num_tokens": 2316438637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7619697874882649, "frac_reward_zero_std": 1.0, "grad_norm": 4.650783993181284e-21, "kl": 0.016845703125, "learning_rate": 3.2639500998012232e-06, "loss": 0.0007, "num_tokens": 2317000621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7621404796449603, "frac_reward_zero_std": 1.0, "grad_norm": 4.9560502339013154e-21, "kl": 0.016815185546875, "learning_rate": 3.2595478732869902e-06, "loss": 0.0007, "num_tokens": 2317572845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7623111718016558, "frac_reward_zero_std": 1.0, "grad_norm": 4.630883908417974e-21, "kl": 0.016998291015625, "learning_rate": 3.255148039389948e-06, "loss": 0.0007, "num_tokens": 2318145165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7624818639583512, "frac_reward_zero_std": 1.0, "grad_norm": 4.5854681128213074e-21, "kl": 0.01708984375, "learning_rate": 3.250750599671878e-06, "loss": 0.0007, "num_tokens": 2318711357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7626525561150466, "frac_reward_zero_std": 1.0, "grad_norm": 4.889273051744445e-21, "kl": 0.01727294921875, "learning_rate": 3.2463555556937075e-06, "loss": 0.0007, "num_tokens": 2319281229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7628232482717419, "frac_reward_zero_std": 1.0, "grad_norm": 5.3960038175510206e-21, "kl": 0.017059326171875, "learning_rate": 3.2419629090155215e-06, "loss": 0.0007, "num_tokens": 2319845933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7629939404284373, "frac_reward_zero_std": 1.0, "grad_norm": 4.774083351537124e-21, "kl": 0.016998291015625, "learning_rate": 3.2375726611965552e-06, "loss": 0.0007, "num_tokens": 2320413789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7631646325851327, "frac_reward_zero_std": 1.0, "grad_norm": 4.348347643343492e-21, "kl": 0.016845703125, "learning_rate": 3.233184813795185e-06, "loss": 0.0007, "num_tokens": 2320991021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7633353247418281, "frac_reward_zero_std": 1.0, "grad_norm": 4.946751698221865e-21, "kl": 0.017059326171875, "learning_rate": 3.2287993683689347e-06, "loss": 0.0007, "num_tokens": 2321553677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7635060168985235, "frac_reward_zero_std": 1.0, "grad_norm": 5.2129390391168254e-21, "kl": 0.01690673828125, "learning_rate": 3.224416326474482e-06, "loss": 0.0007, "num_tokens": 2322119709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7636767090552189, "frac_reward_zero_std": 1.0, "grad_norm": 5.14818964984569e-21, "kl": 0.01702880859375, "learning_rate": 3.220035689667652e-06, "loss": 0.0007, "num_tokens": 2322688461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7638474012119143, "frac_reward_zero_std": 1.0, "grad_norm": 4.239650445390782e-21, "kl": 0.016815185546875, "learning_rate": 3.2156574595034107e-06, "loss": 0.0007, "num_tokens": 2323258157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7640180933686097, "frac_reward_zero_std": 1.0, "grad_norm": 4.996385244044785e-21, "kl": 0.01702880859375, "learning_rate": 3.211281637535866e-06, "loss": 0.0007, "num_tokens": 2323825725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7641887855253051, "frac_reward_zero_std": 1.0, "grad_norm": 4.467002461391157e-21, "kl": 0.01702880859375, "learning_rate": 3.2069082253182813e-06, "loss": 0.0007, "num_tokens": 2324387325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7643594776820005, "frac_reward_zero_std": 1.0, "grad_norm": 4.9232438201245075e-21, "kl": 0.017059326171875, "learning_rate": 3.2025372244030638e-06, "loss": 0.0007, "num_tokens": 2324953901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7645301698386959, "frac_reward_zero_std": 1.0, "grad_norm": 4.9207372711719845e-21, "kl": 0.016693115234375, "learning_rate": 3.1981686363417573e-06, "loss": 0.0007, "num_tokens": 2325518477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7647008619953913, "frac_reward_zero_std": 1.0, "grad_norm": 4.817047284496553e-21, "kl": 0.016937255859375, "learning_rate": 3.1938024626850507e-06, "loss": 0.0007, "num_tokens": 2326087853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7648715541520867, "frac_reward_zero_std": 1.0, "grad_norm": 4.7617260591121786e-21, "kl": 0.016845703125, "learning_rate": 3.189438704982779e-06, "loss": 0.0007, "num_tokens": 2326653245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7650422463087821, "frac_reward_zero_std": 1.0, "grad_norm": 4.568703843233492e-21, "kl": 0.01690673828125, "learning_rate": 3.185077364783923e-06, "loss": 0.0007, "num_tokens": 2327215101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7652129384654776, "frac_reward_zero_std": 1.0, "grad_norm": 4.7202973950718015e-21, "kl": 0.016876220703125, "learning_rate": 3.1807184436365944e-06, "loss": 0.0007, "num_tokens": 2327785149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.765383630622173, "frac_reward_zero_std": 1.0, "grad_norm": 4.8881740129943664e-21, "kl": 0.01739501953125, "learning_rate": 3.176361943088061e-06, "loss": 0.0007, "num_tokens": 2328352157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7655543227788683, "frac_reward_zero_std": 1.0, "grad_norm": 4.747279237457259e-21, "kl": 0.017333984375, "learning_rate": 3.172007864684714e-06, "loss": 0.0007, "num_tokens": 2328918669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7657250149355637, "frac_reward_zero_std": 1.0, "grad_norm": 5.185312919579686e-21, "kl": 0.017181396484375, "learning_rate": 3.167656209972103e-06, "loss": 0.0007, "num_tokens": 2329484909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7658957070922591, "frac_reward_zero_std": 1.0, "grad_norm": 4.472750213220342e-21, "kl": 0.017059326171875, "learning_rate": 3.1633069804949004e-06, "loss": 0.0007, "num_tokens": 2330047949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7660663992489545, "frac_reward_zero_std": 1.0, "grad_norm": 8.061310384884777e+18, "kl": 5.519724293295964e+17, "learning_rate": 3.1589601777969314e-06, "loss": 2.2095785671786496e+16, "num_tokens": 2330648765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7662370914056499, "frac_reward_zero_std": 1.0, "grad_norm": 8.275015516618845e-20, "kl": 0.017120361328125, "learning_rate": 3.1546158034211504e-06, "loss": 0.0007, "num_tokens": 2331212605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7664077835623453, "frac_reward_zero_std": 1.0, "grad_norm": 1.4594551762570485e-18, "kl": 0.01751708984375, "learning_rate": 3.150273858909657e-06, "loss": 0.0007, "num_tokens": 2331783725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7665784757190407, "frac_reward_zero_std": 1.0, "grad_norm": 1.5958291929605606e-17, "kl": 0.01678466796875, "learning_rate": 3.1459343458036807e-06, "loss": 0.0007, "num_tokens": 2332361805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7667491678757361, "frac_reward_zero_std": 1.0, "grad_norm": 1.5669025245388158e-16, "kl": 0.01739501953125, "learning_rate": 3.1415972656435966e-06, "loss": 0.0007, "num_tokens": 2332929245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7669198600324315, "frac_reward_zero_std": 1.0, "grad_norm": 9.221304138067619e-16, "kl": 0.017059326171875, "learning_rate": 3.1372626199689062e-06, "loss": 0.0007, "num_tokens": 2333493421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7670905521891269, "frac_reward_zero_std": 1.0, "grad_norm": 5.980376258433348e-15, "kl": 0.016357421875, "learning_rate": 3.132930410318258e-06, "loss": 0.0007, "num_tokens": 2334061037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7672612443458223, "frac_reward_zero_std": 1.0, "grad_norm": 2.7343930958982343e-14, "kl": 0.017059326171875, "learning_rate": 3.128600638229423e-06, "loss": 0.0007, "num_tokens": 2334633021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7674319365025177, "frac_reward_zero_std": 1.0, "grad_norm": 9.366614752925509e-14, "kl": 0.016693115234375, "learning_rate": 3.1242733052393227e-06, "loss": 0.0007, "num_tokens": 2335199037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7676026286592131, "frac_reward_zero_std": 1.0, "grad_norm": 3.6884472056366455e-13, "kl": 0.016937255859375, "learning_rate": 3.1199484128839965e-06, "loss": 0.0007, "num_tokens": 2335765677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7677733208159085, "frac_reward_zero_std": 1.0, "grad_norm": 1.0493389787608815e-12, "kl": 0.016632080078125, "learning_rate": 3.115625962698631e-06, "loss": 0.0007, "num_tokens": 2336330685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.767944012972604, "frac_reward_zero_std": 1.0, "grad_norm": 3.395549945277623e-12, "kl": 0.01727294921875, "learning_rate": 3.111305956217533e-06, "loss": 0.0007, "num_tokens": 2336901245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7681147051292994, "frac_reward_zero_std": 1.0, "grad_norm": 1.2575694390454208e-11, "kl": 0.016632080078125, "learning_rate": 3.1069883949741575e-06, "loss": 0.0007, "num_tokens": 2337461389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7682853972859947, "frac_reward_zero_std": 1.0, "grad_norm": 6.534736620287196e-11, "kl": 0.016876220703125, "learning_rate": 3.102673280501074e-06, "loss": 0.0007, "num_tokens": 2338030877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7684560894426901, "frac_reward_zero_std": 1.0, "grad_norm": 1.9318307872599004e-10, "kl": 0.01690673828125, "learning_rate": 3.0983606143300006e-06, "loss": 0.0007, "num_tokens": 2338592749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7686267815993855, "frac_reward_zero_std": 1.0, "grad_norm": 7.94977291661562e-10, "kl": 0.016845703125, "learning_rate": 3.0940503979917715e-06, "loss": 0.0007, "num_tokens": 2339157501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7687974737560809, "frac_reward_zero_std": 1.0, "grad_norm": 4.521254827562759e-06, "kl": 0.01666259765625, "learning_rate": 3.0897426330163594e-06, "loss": 0.0007, "num_tokens": 2339727405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7689681659127763, "frac_reward_zero_std": 1.0, "grad_norm": 6.880981442964121e-05, "kl": 0.016998291015625, "learning_rate": 3.0854373209328704e-06, "loss": 0.0007, "num_tokens": 2340302077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7691388580694717, "frac_reward_zero_std": 1.0, "grad_norm": 9.68846244460208e-05, "kl": 0.016815185546875, "learning_rate": 3.0811344632695316e-06, "loss": 0.0007, "num_tokens": 2340870253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7693095502261671, "frac_reward_zero_std": 1.0, "grad_norm": 0.00030248071946126253, "kl": 0.017608642578125, "learning_rate": 3.0768340615536975e-06, "loss": 0.0007, "num_tokens": 2341435965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7694802423828625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004604331365522239, "kl": 0.01788330078125, "learning_rate": 3.0725361173118604e-06, "loss": 0.0007, "num_tokens": 2342002253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7696509345395579, "frac_reward_zero_std": 1.0, "grad_norm": 0.013839313112540866, "kl": 0.0189208984375, "learning_rate": 3.0682406320696377e-06, "loss": 0.0008, "num_tokens": 2342562461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7698216266962533, "frac_reward_zero_std": 1.0, "grad_norm": 0.006679601536565994, "kl": 0.01898193359375, "learning_rate": 3.063947607351768e-06, "loss": 0.0008, "num_tokens": 2343124685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7699923188529487, "frac_reward_zero_std": 1.0, "grad_norm": 0.4659615758152763, "kl": 0.04608154296875, "learning_rate": 3.059657044682117e-06, "loss": 0.0018, "num_tokens": 2343690781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7701630110096441, "frac_reward_zero_std": 1.0, "grad_norm": 1781.2306624312228, "kl": 205.8125, "learning_rate": 3.0553689455836844e-06, "loss": 8.2296, "num_tokens": 2344259773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 138.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 108.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 80.0, "completions/min_terminated_length": 0.0, "epoch": 0.7703337031663395, "frac_reward_zero_std": 1.0, "grad_norm": 741149.5602700814, "kl": 43392.0, "learning_rate": 3.051083311578592e-06, "loss": 1733.751, "num_tokens": 2344328909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 88.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 68.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 50.0, "completions/min_terminated_length": 0.0, "epoch": 0.7705043953230349, "frac_reward_zero_std": 1.0, "grad_norm": 1.268882120759836, "kl": 4.5703125, "learning_rate": 3.0468001441880823e-06, "loss": 0.1828, "num_tokens": 2344386237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 84.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 66.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 53.0, "completions/min_terminated_length": 0.0, "epoch": 0.7706750874797303, "frac_reward_zero_std": 1.0, "grad_norm": 3.2600300636255986, "kl": 5.1328125, "learning_rate": 3.042519444932522e-06, "loss": 0.2053, "num_tokens": 2344442685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 114.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 73.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 54.0, "completions/min_terminated_length": 0.0, "epoch": 0.7708457796364258, "frac_reward_zero_std": 1.0, "grad_norm": 3.310454879965002, "kl": 5.9375, "learning_rate": 3.038241215331409e-06, "loss": 0.2378, "num_tokens": 2344501965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 242.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 189.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 157.0, "completions/min_terminated_length": 0.0, "epoch": 0.7710164717931212, "frac_reward_zero_std": 1.0, "grad_norm": 1.7818230599608906, "kl": 4.5546875, "learning_rate": 3.0339654569033616e-06, "loss": 0.1821, "num_tokens": 2344585021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 788.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 681.25, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.7711871639498165, "frac_reward_zero_std": 1.0, "grad_norm": 1.1832059601784515, "kl": 3.609375, "learning_rate": 3.029692171166119e-06, "loss": 0.1445, "num_tokens": 2344798925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1901.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1720.625, "completions/mean_terminated_length": 0.0, "completions/min_length": 1515.0, "completions/min_terminated_length": 0.0, "epoch": 0.7713578561065119, "frac_reward_zero_std": 1.0, "grad_norm": 1.8300596863827796, "kl": 3.44140625, "learning_rate": 3.0254213596365367e-06, "loss": 0.1378, "num_tokens": 2345279997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1557.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1204.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 954.0, "completions/min_terminated_length": 0.0, "epoch": 0.7715285482632073, "frac_reward_zero_std": 1.0, "grad_norm": 0.7073149250223275, "kl": 3.4296875, "learning_rate": 3.021153023830605e-06, "loss": 0.1372, "num_tokens": 2345628285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 848.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 591.5, "completions/mean_terminated_length": 0.0, "completions/min_length": 370.0, "completions/min_terminated_length": 0.0, "epoch": 0.7716992404199027, "frac_reward_zero_std": 1.0, "grad_norm": 3.0241425623650535, "kl": 3.30078125, "learning_rate": 3.016887165263428e-06, "loss": 0.132, "num_tokens": 2345816861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2028.125, "completions/mean_terminated_length": 0.0, "completions/min_length": 1889.0, "completions/min_terminated_length": 0.0, "epoch": 0.7718699325765981, "frac_reward_zero_std": 1.0, "grad_norm": 901.792871149894, "kl": 113.53125, "learning_rate": 3.012623785449238e-06, "loss": 4.552, "num_tokens": 2346372701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7720406247332935, "frac_reward_zero_std": 1.0, "grad_norm": 298.58853832700765, "kl": 37.9453125, "learning_rate": 3.0083628859013682e-06, "loss": 1.5222, "num_tokens": 2346934189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7722113168899889, "frac_reward_zero_std": 1.0, "grad_norm": 6.0854545705513505, "kl": 0.90673828125, "learning_rate": 3.0041044681322895e-06, "loss": 0.0362, "num_tokens": 2347498477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7723820090466843, "frac_reward_zero_std": 1.0, "grad_norm": 1.5391920091039872, "kl": 0.2880859375, "learning_rate": 2.999848533653588e-06, "loss": 0.0115, "num_tokens": 2348070557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7725527012033797, "frac_reward_zero_std": 1.0, "grad_norm": 1.0884852607378115, "kl": 0.2269287109375, "learning_rate": 2.9955950839759685e-06, "loss": 0.0091, "num_tokens": 2348629421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7727233933600751, "frac_reward_zero_std": 1.0, "grad_norm": 0.004171172206531654, "kl": 0.0804443359375, "learning_rate": 2.991344120609251e-06, "loss": 0.0032, "num_tokens": 2349192061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7728940855167705, "frac_reward_zero_std": 1.0, "grad_norm": 3.142569106415635e-07, "kl": 0.0682373046875, "learning_rate": 2.987095645062368e-06, "loss": 0.0027, "num_tokens": 2349757677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7730647776734659, "frac_reward_zero_std": 1.0, "grad_norm": 1.954926898445378e-07, "kl": 0.0733642578125, "learning_rate": 2.982849658843381e-06, "loss": 0.0029, "num_tokens": 2350320477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7732354698301613, "frac_reward_zero_std": 1.0, "grad_norm": 1.0964619610885955e-07, "kl": 0.0701904296875, "learning_rate": 2.978606163459462e-06, "loss": 0.0028, "num_tokens": 2350882189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7734061619868567, "frac_reward_zero_std": 1.0, "grad_norm": 6.308486312938562e-08, "kl": 0.0723876953125, "learning_rate": 2.9743651604168987e-06, "loss": 0.0029, "num_tokens": 2351446781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7735768541435522, "frac_reward_zero_std": 1.0, "grad_norm": 4.164769204481051e-08, "kl": 0.069091796875, "learning_rate": 2.970126651221089e-06, "loss": 0.0028, "num_tokens": 2352009117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7737475463002476, "frac_reward_zero_std": 1.0, "grad_norm": 2.594452701391721e-08, "kl": 0.0673828125, "learning_rate": 2.965890637376554e-06, "loss": 0.0027, "num_tokens": 2352576237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7739182384569429, "frac_reward_zero_std": 1.0, "grad_norm": 1.851869732914689e-08, "kl": 0.0750732421875, "learning_rate": 2.961657120386929e-06, "loss": 0.003, "num_tokens": 2353140813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7740889306136383, "frac_reward_zero_std": 1.0, "grad_norm": 9.519924781876164e-09, "kl": 0.07666015625, "learning_rate": 2.957426101754958e-06, "loss": 0.0031, "num_tokens": 2353705389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7742596227703337, "frac_reward_zero_std": 1.0, "grad_norm": 6.840073898687954e-09, "kl": 0.072509765625, "learning_rate": 2.9531975829824946e-06, "loss": 0.0029, "num_tokens": 2354270653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7744303149270291, "frac_reward_zero_std": 1.0, "grad_norm": 4.435161809398528e-09, "kl": 0.0726318359375, "learning_rate": 2.948971565570514e-06, "loss": 0.0029, "num_tokens": 2354839789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7746010070837245, "frac_reward_zero_std": 1.0, "grad_norm": 4.64377556704642e-09, "kl": 0.0755615234375, "learning_rate": 2.944748051019104e-06, "loss": 0.003, "num_tokens": 2355401389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7747716992404199, "frac_reward_zero_std": 1.0, "grad_norm": 3.9970169697803706e-09, "kl": 0.0697021484375, "learning_rate": 2.9405270408274545e-06, "loss": 0.0028, "num_tokens": 2355961965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7749423913971153, "frac_reward_zero_std": 1.0, "grad_norm": 2.5255252997842562e-09, "kl": 0.069580078125, "learning_rate": 2.9363085364938717e-06, "loss": 0.0028, "num_tokens": 2356529581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7751130835538107, "frac_reward_zero_std": 1.0, "grad_norm": 2.201690138352292e-09, "kl": 0.07080078125, "learning_rate": 2.9320925395157728e-06, "loss": 0.0028, "num_tokens": 2357093309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7752837757105061, "frac_reward_zero_std": 1.0, "grad_norm": 1.3805351080741808e-09, "kl": 0.0712890625, "learning_rate": 2.9278790513896884e-06, "loss": 0.0028, "num_tokens": 2357659629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7754544678672015, "frac_reward_zero_std": 1.0, "grad_norm": 2.239829084911625e-09, "kl": 0.0755615234375, "learning_rate": 2.923668073611252e-06, "loss": 0.003, "num_tokens": 2358221773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7756251600238969, "frac_reward_zero_std": 1.0, "grad_norm": 1.6645362120513126e-09, "kl": 0.0692138671875, "learning_rate": 2.9194596076752067e-06, "loss": 0.0028, "num_tokens": 2358788141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7757958521805923, "frac_reward_zero_std": 1.0, "grad_norm": 1.6512299684839047e-09, "kl": 0.0694580078125, "learning_rate": 2.9152536550754062e-06, "loss": 0.0028, "num_tokens": 2359351469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7759665443372877, "frac_reward_zero_std": 1.0, "grad_norm": 1.4980262242373642e-09, "kl": 0.076416015625, "learning_rate": 2.911050217304817e-06, "loss": 0.0031, "num_tokens": 2359914653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7761372364939831, "frac_reward_zero_std": 1.0, "grad_norm": 1.0213568791704795e-09, "kl": 0.0716552734375, "learning_rate": 2.9068492958555016e-06, "loss": 0.0029, "num_tokens": 2360480397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7763079286506785, "frac_reward_zero_std": 1.0, "grad_norm": 9.268736538312439e-10, "kl": 0.0726318359375, "learning_rate": 2.9026508922186416e-06, "loss": 0.0029, "num_tokens": 2361048205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.776478620807374, "frac_reward_zero_std": 1.0, "grad_norm": 1.0201269052728605e-09, "kl": 0.068359375, "learning_rate": 2.898455007884512e-06, "loss": 0.0027, "num_tokens": 2361616237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7766493129640692, "frac_reward_zero_std": 1.0, "grad_norm": 1.1714231332552927e-09, "kl": 0.072021484375, "learning_rate": 2.8942616443425064e-06, "loss": 0.0029, "num_tokens": 2362176781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7768200051207647, "frac_reward_zero_std": 1.0, "grad_norm": 1.1316463345180203e-09, "kl": 0.0731201171875, "learning_rate": 2.8900708030811108e-06, "loss": 0.0029, "num_tokens": 2362736829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7769906972774601, "frac_reward_zero_std": 1.0, "grad_norm": 1.0118799357918065e-09, "kl": 0.07080078125, "learning_rate": 2.8858824855879307e-06, "loss": 0.0028, "num_tokens": 2363304861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7771613894341555, "frac_reward_zero_std": 1.0, "grad_norm": 9.222688521198052e-10, "kl": 0.07177734375, "learning_rate": 2.8816966933496595e-06, "loss": 0.0029, "num_tokens": 2363864349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7773320815908509, "frac_reward_zero_std": 1.0, "grad_norm": 6.651755896265667e-10, "kl": 0.068115234375, "learning_rate": 2.87751342785211e-06, "loss": 0.0027, "num_tokens": 2364429901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7775027737475463, "frac_reward_zero_std": 1.0, "grad_norm": 7.716361410457028e-10, "kl": 0.0738525390625, "learning_rate": 2.8733326905801827e-06, "loss": 0.003, "num_tokens": 2364995197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7776734659042417, "frac_reward_zero_std": 1.0, "grad_norm": 9.142912194161831e-10, "kl": 0.0716552734375, "learning_rate": 2.8691544830178954e-06, "loss": 0.0029, "num_tokens": 2365562157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7778441580609371, "frac_reward_zero_std": 1.0, "grad_norm": 9.813429981989356e-10, "kl": 0.070556640625, "learning_rate": 2.864978806648355e-06, "loss": 0.0028, "num_tokens": 2366123469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7780148502176325, "frac_reward_zero_std": 1.0, "grad_norm": 7.217823251224768e-10, "kl": 0.07080078125, "learning_rate": 2.860805662953783e-06, "loss": 0.0028, "num_tokens": 2366694205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7781855423743279, "frac_reward_zero_std": 1.0, "grad_norm": 6.512702634609991e-10, "kl": 0.0701904296875, "learning_rate": 2.8566350534154865e-06, "loss": 0.0028, "num_tokens": 2367257501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7783562345310233, "frac_reward_zero_std": 1.0, "grad_norm": 4.2162426498468456e-10, "kl": 0.07177734375, "learning_rate": 2.852466979513888e-06, "loss": 0.0029, "num_tokens": 2367829293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7785269266877187, "frac_reward_zero_std": 1.0, "grad_norm": 8.347432590511397e-10, "kl": 0.07373046875, "learning_rate": 2.8483014427284973e-06, "loss": 0.0029, "num_tokens": 2368392893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7786976188444141, "frac_reward_zero_std": 1.0, "grad_norm": 6.508671753587795e-10, "kl": 0.070068359375, "learning_rate": 2.844138444537937e-06, "loss": 0.0028, "num_tokens": 2368955245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7788683110011095, "frac_reward_zero_std": 1.0, "grad_norm": 6.02829098107055e-10, "kl": 0.06787109375, "learning_rate": 2.839977986419914e-06, "loss": 0.0027, "num_tokens": 2369519437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7790390031578049, "frac_reward_zero_std": 1.0, "grad_norm": 4.539452192952833e-10, "kl": 0.0684814453125, "learning_rate": 2.835820069851243e-06, "loss": 0.0027, "num_tokens": 2370091725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7792096953145004, "frac_reward_zero_std": 1.0, "grad_norm": 4.914624970087827e-10, "kl": 0.069091796875, "learning_rate": 2.8316646963078386e-06, "loss": 0.0028, "num_tokens": 2370659597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7793803874711956, "frac_reward_zero_std": 1.0, "grad_norm": 5.116363269017046e-10, "kl": 0.0703125, "learning_rate": 2.8275118672647063e-06, "loss": 0.0028, "num_tokens": 2371224493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.779551079627891, "frac_reward_zero_std": 1.0, "grad_norm": 6.65706652775607e-10, "kl": 0.0731201171875, "learning_rate": 2.823361584195944e-06, "loss": 0.0029, "num_tokens": 2371786957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7797217717845865, "frac_reward_zero_std": 1.0, "grad_norm": 6.215812309322745e-10, "kl": 0.068359375, "learning_rate": 2.8192138485747587e-06, "loss": 0.0027, "num_tokens": 2372353245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7798924639412819, "frac_reward_zero_std": 1.0, "grad_norm": 5.52499338570234e-10, "kl": 0.0699462890625, "learning_rate": 2.815068661873449e-06, "loss": 0.0028, "num_tokens": 2372920957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7800631560979773, "frac_reward_zero_std": 1.0, "grad_norm": 5.607247474792697e-10, "kl": 0.072021484375, "learning_rate": 2.8109260255634018e-06, "loss": 0.0029, "num_tokens": 2373486269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7802338482546727, "frac_reward_zero_std": 1.0, "grad_norm": 6.264292197290363e-10, "kl": 0.0709228515625, "learning_rate": 2.8067859411151034e-06, "loss": 0.0028, "num_tokens": 2374050701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7804045404113681, "frac_reward_zero_std": 1.0, "grad_norm": 6.868938495247794e-10, "kl": 0.0716552734375, "learning_rate": 2.8026484099981354e-06, "loss": 0.0029, "num_tokens": 2374614541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7805752325680635, "frac_reward_zero_std": 1.0, "grad_norm": 5.755381197774817e-10, "kl": 0.0736083984375, "learning_rate": 2.798513433681175e-06, "loss": 0.0029, "num_tokens": 2375181533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7807459247247589, "frac_reward_zero_std": 1.0, "grad_norm": 7.485227855516502e-10, "kl": 0.07080078125, "learning_rate": 2.7943810136319873e-06, "loss": 0.0028, "num_tokens": 2375742893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7809166168814543, "frac_reward_zero_std": 1.0, "grad_norm": 6.05324987686981e-10, "kl": 0.0704345703125, "learning_rate": 2.790251151317428e-06, "loss": 0.0028, "num_tokens": 2376308173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7810873090381497, "frac_reward_zero_std": 1.0, "grad_norm": 6.801507828470901e-10, "kl": 0.07177734375, "learning_rate": 2.7861238482034535e-06, "loss": 0.0029, "num_tokens": 2376870605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7812580011948451, "frac_reward_zero_std": 1.0, "grad_norm": 6.124032925973303e-10, "kl": 0.0716552734375, "learning_rate": 2.78199910575511e-06, "loss": 0.0029, "num_tokens": 2377442141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7814286933515405, "frac_reward_zero_std": 1.0, "grad_norm": 5.401084229954715e-10, "kl": 0.071044921875, "learning_rate": 2.777876925436529e-06, "loss": 0.0028, "num_tokens": 2378008477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7815993855082359, "frac_reward_zero_std": 1.0, "grad_norm": 5.30880963462684e-10, "kl": 0.0675048828125, "learning_rate": 2.7737573087109336e-06, "loss": 0.0027, "num_tokens": 2378577549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7817700776649313, "frac_reward_zero_std": 1.0, "grad_norm": 6.267967812799549e-10, "kl": 0.0677490234375, "learning_rate": 2.76964025704064e-06, "loss": 0.0027, "num_tokens": 2379143981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7819407698216267, "frac_reward_zero_std": 1.0, "grad_norm": 7.297944686703646e-10, "kl": 0.07275390625, "learning_rate": 2.7655257718870544e-06, "loss": 0.0029, "num_tokens": 2379705405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.782111461978322, "frac_reward_zero_std": 1.0, "grad_norm": 6.992491922129927e-10, "kl": 0.0703125, "learning_rate": 2.761413854710677e-06, "loss": 0.0028, "num_tokens": 2380268317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7822821541350174, "frac_reward_zero_std": 1.0, "grad_norm": 6.303571154294575e-10, "kl": 0.0721435546875, "learning_rate": 2.757304506971077e-06, "loss": 0.0029, "num_tokens": 2380835853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7824528462917129, "frac_reward_zero_std": 1.0, "grad_norm": 4.916324255069441e-10, "kl": 0.0667724609375, "learning_rate": 2.7531977301269298e-06, "loss": 0.0027, "num_tokens": 2381403917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7826235384484083, "frac_reward_zero_std": 1.0, "grad_norm": 5.504962996920165e-10, "kl": 0.0662841796875, "learning_rate": 2.7490935256359953e-06, "loss": 0.0026, "num_tokens": 2381970669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7827942306051037, "frac_reward_zero_std": 1.0, "grad_norm": 5.674629764375141e-10, "kl": 0.0762939453125, "learning_rate": 2.744991894955118e-06, "loss": 0.003, "num_tokens": 2382538637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7829649227617991, "frac_reward_zero_std": 1.0, "grad_norm": 7.317588206611154e-10, "kl": 0.0721435546875, "learning_rate": 2.740892839540229e-06, "loss": 0.0029, "num_tokens": 2383102829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7831356149184945, "frac_reward_zero_std": 1.0, "grad_norm": 8.631153876425643e-10, "kl": 0.070556640625, "learning_rate": 2.736796360846339e-06, "loss": 0.0028, "num_tokens": 2383662205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7833063070751899, "frac_reward_zero_std": 1.0, "grad_norm": 6.309455530191024e-10, "kl": 0.0714111328125, "learning_rate": 2.732702460327554e-06, "loss": 0.0029, "num_tokens": 2384227517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7834769992318853, "frac_reward_zero_std": 1.0, "grad_norm": 4.702713934434922e-10, "kl": 0.065185546875, "learning_rate": 2.728611139437065e-06, "loss": 0.0026, "num_tokens": 2384793421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7836476913885807, "frac_reward_zero_std": 1.0, "grad_norm": 5.908975047208718e-10, "kl": 0.0714111328125, "learning_rate": 2.7245223996271398e-06, "loss": 0.0029, "num_tokens": 2385360733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7838183835452761, "frac_reward_zero_std": 1.0, "grad_norm": 5.031550640847982e-10, "kl": 0.06982421875, "learning_rate": 2.7204362423491294e-06, "loss": 0.0028, "num_tokens": 2385926029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7839890757019715, "frac_reward_zero_std": 1.0, "grad_norm": 4.861376227488834e-10, "kl": 0.0723876953125, "learning_rate": 2.716352669053476e-06, "loss": 0.0029, "num_tokens": 2386498445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7841597678586669, "frac_reward_zero_std": 1.0, "grad_norm": 6.389501708140214e-10, "kl": 0.072021484375, "learning_rate": 2.712271681189703e-06, "loss": 0.0029, "num_tokens": 2387059245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7843304600153623, "frac_reward_zero_std": 1.0, "grad_norm": 4.989057128311012e-10, "kl": 0.0679931640625, "learning_rate": 2.7081932802064113e-06, "loss": 0.0027, "num_tokens": 2387630941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7845011521720577, "frac_reward_zero_std": 1.0, "grad_norm": 4.1438761993164634e-10, "kl": 0.06884765625, "learning_rate": 2.704117467551284e-06, "loss": 0.0028, "num_tokens": 2388199949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7846718443287531, "frac_reward_zero_std": 1.0, "grad_norm": 6.036275042008137e-10, "kl": 0.0706787109375, "learning_rate": 2.7000442446710885e-06, "loss": 0.0028, "num_tokens": 2388765197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7848425364854484, "frac_reward_zero_std": 1.0, "grad_norm": 6.510316077419957e-10, "kl": 0.0733642578125, "learning_rate": 2.6959736130116763e-06, "loss": 0.0029, "num_tokens": 2389331053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7850132286421438, "frac_reward_zero_std": 1.0, "grad_norm": 5.00293864395609e-10, "kl": 0.070068359375, "learning_rate": 2.691905574017971e-06, "loss": 0.0028, "num_tokens": 2389898957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7851839207988393, "frac_reward_zero_std": 1.0, "grad_norm": 5.865572357894761e-10, "kl": 0.070556640625, "learning_rate": 2.6878401291339774e-06, "loss": 0.0028, "num_tokens": 2390465437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7853546129555347, "frac_reward_zero_std": 1.0, "grad_norm": 4.863346790660746e-10, "kl": 0.0697021484375, "learning_rate": 2.683777279802784e-06, "loss": 0.0028, "num_tokens": 2391032765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7855253051122301, "frac_reward_zero_std": 1.0, "grad_norm": 6.826891473606739e-10, "kl": 0.074462890625, "learning_rate": 2.6797170274665584e-06, "loss": 0.003, "num_tokens": 2391600925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7856959972689255, "frac_reward_zero_std": 1.0, "grad_norm": 5.645976787410528e-10, "kl": 0.072998046875, "learning_rate": 2.6756593735665413e-06, "loss": 0.0029, "num_tokens": 2392175837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7858666894256209, "frac_reward_zero_std": 1.0, "grad_norm": 7.028198547106558e-10, "kl": 0.072509765625, "learning_rate": 2.67160431954305e-06, "loss": 0.0029, "num_tokens": 2392737757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7860373815823163, "frac_reward_zero_std": 1.0, "grad_norm": 7.263437050812204e-10, "kl": 0.0679931640625, "learning_rate": 2.667551866835485e-06, "loss": 0.0027, "num_tokens": 2393297597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7862080737390117, "frac_reward_zero_std": 1.0, "grad_norm": 5.630247155047353e-10, "kl": 0.0716552734375, "learning_rate": 2.6635020168823243e-06, "loss": 0.0029, "num_tokens": 2393868925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7863787658957071, "frac_reward_zero_std": 1.0, "grad_norm": 5.407427537805365e-10, "kl": 0.07177734375, "learning_rate": 2.6594547711211126e-06, "loss": 0.0029, "num_tokens": 2394434109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7865494580524025, "frac_reward_zero_std": 1.0, "grad_norm": 7.174080978140028e-10, "kl": 0.0732421875, "learning_rate": 2.6554101309884817e-06, "loss": 0.0029, "num_tokens": 2394997741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7867201502090979, "frac_reward_zero_std": 1.0, "grad_norm": 5.897299678998892e-10, "kl": 0.0704345703125, "learning_rate": 2.6513680979201263e-06, "loss": 0.0028, "num_tokens": 2395563853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7868908423657933, "frac_reward_zero_std": 1.0, "grad_norm": 8.198595702017192e-10, "kl": 0.0728759765625, "learning_rate": 2.64732867335083e-06, "loss": 0.0029, "num_tokens": 2396126125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7870615345224887, "frac_reward_zero_std": 1.0, "grad_norm": 3.77339773971874e-10, "kl": 0.0692138671875, "learning_rate": 2.6432918587144342e-06, "loss": 0.0028, "num_tokens": 2396693837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7872322266791841, "frac_reward_zero_std": 1.0, "grad_norm": 5.933792796329732e-10, "kl": 0.0758056640625, "learning_rate": 2.63925765544387e-06, "loss": 0.003, "num_tokens": 2397257469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7874029188358795, "frac_reward_zero_std": 1.0, "grad_norm": 7.297140721994415e-10, "kl": 0.07177734375, "learning_rate": 2.635226064971127e-06, "loss": 0.0029, "num_tokens": 2397819453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.787573610992575, "frac_reward_zero_std": 1.0, "grad_norm": 3.887752717319145e-10, "kl": 0.0677490234375, "learning_rate": 2.631197088727282e-06, "loss": 0.0027, "num_tokens": 2398389597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7877443031492702, "frac_reward_zero_std": 1.0, "grad_norm": 3.976169463439161e-10, "kl": 0.0687255859375, "learning_rate": 2.6271707281424685e-06, "loss": 0.0028, "num_tokens": 2398953741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7879149953059656, "frac_reward_zero_std": 1.0, "grad_norm": 4.0372033322908853e-10, "kl": 0.071044921875, "learning_rate": 2.6231469846459055e-06, "loss": 0.0028, "num_tokens": 2399522413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.788085687462661, "frac_reward_zero_std": 1.0, "grad_norm": 6.196478753048754e-10, "kl": 0.0687255859375, "learning_rate": 2.6191258596658707e-06, "loss": 0.0028, "num_tokens": 2400086909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7882563796193565, "frac_reward_zero_std": 1.0, "grad_norm": 5.121977224050668e-10, "kl": 0.07080078125, "learning_rate": 2.6151073546297245e-06, "loss": 0.0028, "num_tokens": 2400652461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7884270717760519, "frac_reward_zero_std": 1.0, "grad_norm": 6.232755951041391e-10, "kl": 0.071044921875, "learning_rate": 2.611091470963886e-06, "loss": 0.0028, "num_tokens": 2401221005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7885977639327473, "frac_reward_zero_std": 1.0, "grad_norm": 7.907490784569909e-10, "kl": 0.0679931640625, "learning_rate": 2.607078210093853e-06, "loss": 0.0027, "num_tokens": 2401781261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7887684560894427, "frac_reward_zero_std": 1.0, "grad_norm": 7.099444098153365e-10, "kl": 0.0679931640625, "learning_rate": 2.6030675734441833e-06, "loss": 0.0027, "num_tokens": 2402341789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7889391482461381, "frac_reward_zero_std": 1.0, "grad_norm": 4.2508631042039224e-10, "kl": 0.0675048828125, "learning_rate": 2.599059562438515e-06, "loss": 0.0027, "num_tokens": 2402911821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7891098404028335, "frac_reward_zero_std": 1.0, "grad_norm": 4.770345514917327e-10, "kl": 0.06884765625, "learning_rate": 2.5950541784995397e-06, "loss": 0.0028, "num_tokens": 2403482205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7892805325595289, "frac_reward_zero_std": 1.0, "grad_norm": 4.682889023065827e-10, "kl": 0.0753173828125, "learning_rate": 2.591051423049028e-06, "loss": 0.003, "num_tokens": 2404048685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7894512247162243, "frac_reward_zero_std": 1.0, "grad_norm": 3.713586443230796e-10, "kl": 0.067138671875, "learning_rate": 2.5870512975078166e-06, "loss": 0.0027, "num_tokens": 2404618973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7896219168729197, "frac_reward_zero_std": 1.0, "grad_norm": 5.590954839745185e-10, "kl": 0.0672607421875, "learning_rate": 2.583053803295802e-06, "loss": 0.0027, "num_tokens": 2405184285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7897926090296151, "frac_reward_zero_std": 1.0, "grad_norm": 7.16022800358536e-10, "kl": 0.0716552734375, "learning_rate": 2.5790589418319478e-06, "loss": 0.0029, "num_tokens": 2405746637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7899633011863105, "frac_reward_zero_std": 1.0, "grad_norm": 5.863831744985839e-10, "kl": 0.0697021484375, "learning_rate": 2.5750667145342888e-06, "loss": 0.0028, "num_tokens": 2406314221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7901339933430059, "frac_reward_zero_std": 1.0, "grad_norm": 6.848309265011779e-10, "kl": 0.0728759765625, "learning_rate": 2.571077122819925e-06, "loss": 0.0029, "num_tokens": 2406875869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7903046854997013, "frac_reward_zero_std": 1.0, "grad_norm": 6.576767642292471e-10, "kl": 0.0703125, "learning_rate": 2.5670901681050133e-06, "loss": 0.0028, "num_tokens": 2407438989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7904753776563966, "frac_reward_zero_std": 1.0, "grad_norm": 5.259416880590281e-10, "kl": 0.0718994140625, "learning_rate": 2.563105851804777e-06, "loss": 0.0029, "num_tokens": 2408006813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.790646069813092, "frac_reward_zero_std": 1.0, "grad_norm": 3.752213021620062e-10, "kl": 0.0726318359375, "learning_rate": 2.559124175333506e-06, "loss": 0.0029, "num_tokens": 2408570813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7908167619697875, "frac_reward_zero_std": 1.0, "grad_norm": 6.761336648121031e-10, "kl": 0.0689697265625, "learning_rate": 2.555145140104556e-06, "loss": 0.0028, "num_tokens": 2409133021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7909874541264829, "frac_reward_zero_std": 1.0, "grad_norm": 4.665801184315957e-10, "kl": 0.069580078125, "learning_rate": 2.551168747530337e-06, "loss": 0.0028, "num_tokens": 2409699773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7911581462831783, "frac_reward_zero_std": 1.0, "grad_norm": 5.530743781224931e-10, "kl": 0.0716552734375, "learning_rate": 2.5471949990223232e-06, "loss": 0.0029, "num_tokens": 2410272845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7913288384398737, "frac_reward_zero_std": 1.0, "grad_norm": 7.367567334443342e-10, "kl": 0.07421875, "learning_rate": 2.543223895991054e-06, "loss": 0.003, "num_tokens": 2410834893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7914995305965691, "frac_reward_zero_std": 1.0, "grad_norm": 4.856724553972295e-10, "kl": 0.06982421875, "learning_rate": 2.539255439846129e-06, "loss": 0.0028, "num_tokens": 2411398685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7916702227532645, "frac_reward_zero_std": 1.0, "grad_norm": 4.571511235720338e-10, "kl": 0.068115234375, "learning_rate": 2.5352896319962063e-06, "loss": 0.0027, "num_tokens": 2411970189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7918409149099599, "frac_reward_zero_std": 1.0, "grad_norm": 8.646812032354084e-10, "kl": 0.076171875, "learning_rate": 2.5313264738490006e-06, "loss": 0.003, "num_tokens": 2412527869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7920116070666553, "frac_reward_zero_std": 1.0, "grad_norm": 6.551510983270676e-10, "kl": 0.0711669921875, "learning_rate": 2.527365966811294e-06, "loss": 0.0028, "num_tokens": 2413089565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7921822992233507, "frac_reward_zero_std": 1.0, "grad_norm": 5.659186057150079e-10, "kl": 0.068603515625, "learning_rate": 2.5234081122889243e-06, "loss": 0.0027, "num_tokens": 2413653421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7923529913800461, "frac_reward_zero_std": 1.0, "grad_norm": 5.181822007005167e-10, "kl": 0.0716552734375, "learning_rate": 2.519452911686786e-06, "loss": 0.0029, "num_tokens": 2414221549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7925236835367415, "frac_reward_zero_std": 1.0, "grad_norm": 5.647900577126392e-10, "kl": 0.0726318359375, "learning_rate": 2.5155003664088282e-06, "loss": 0.0029, "num_tokens": 2414790381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7926943756934369, "frac_reward_zero_std": 1.0, "grad_norm": 8.082567265221368e-10, "kl": 0.07373046875, "learning_rate": 2.5115504778580656e-06, "loss": 0.0029, "num_tokens": 2415352045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7928650678501323, "frac_reward_zero_std": 1.0, "grad_norm": 6.535491621469431e-10, "kl": 0.0712890625, "learning_rate": 2.5076032474365662e-06, "loss": 0.0029, "num_tokens": 2415914557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7930357600068277, "frac_reward_zero_std": 1.0, "grad_norm": 7.108173180878605e-10, "kl": 0.0721435546875, "learning_rate": 2.5036586765454594e-06, "loss": 0.0029, "num_tokens": 2416477069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.793206452163523, "frac_reward_zero_std": 1.0, "grad_norm": 6.584516145330903e-10, "kl": 0.06787109375, "learning_rate": 2.4997167665849152e-06, "loss": 0.0027, "num_tokens": 2417040125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7933771443202184, "frac_reward_zero_std": 1.0, "grad_norm": 4.2350931612356415e-10, "kl": 0.071044921875, "learning_rate": 2.495777518954173e-06, "loss": 0.0028, "num_tokens": 2417609597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7935478364769138, "frac_reward_zero_std": 1.0, "grad_norm": 7.491674285336609e-10, "kl": 0.074951171875, "learning_rate": 2.491840935051525e-06, "loss": 0.003, "num_tokens": 2418174509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7937185286336093, "frac_reward_zero_std": 1.0, "grad_norm": 6.063935118539219e-10, "kl": 0.0709228515625, "learning_rate": 2.4879070162743192e-06, "loss": 0.0028, "num_tokens": 2418739021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7938892207903047, "frac_reward_zero_std": 1.0, "grad_norm": 6.735587916609368e-10, "kl": 0.0732421875, "learning_rate": 2.483975764018952e-06, "loss": 0.0029, "num_tokens": 2419302589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7940599129470001, "frac_reward_zero_std": 1.0, "grad_norm": 5.066929581343474e-10, "kl": 0.0687255859375, "learning_rate": 2.4800471796808723e-06, "loss": 0.0027, "num_tokens": 2419870557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7942306051036955, "frac_reward_zero_std": 1.0, "grad_norm": 5.070813035200872e-10, "kl": 0.0689697265625, "learning_rate": 2.4761212646545895e-06, "loss": 0.0028, "num_tokens": 2420435741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7944012972603909, "frac_reward_zero_std": 1.0, "grad_norm": 6.061272215642485e-10, "kl": 0.06982421875, "learning_rate": 2.472198020333664e-06, "loss": 0.0028, "num_tokens": 2420998749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7945719894170863, "frac_reward_zero_std": 1.0, "grad_norm": 5.220407886040489e-10, "kl": 0.0740966796875, "learning_rate": 2.468277448110703e-06, "loss": 0.003, "num_tokens": 2421570813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7947426815737817, "frac_reward_zero_std": 1.0, "grad_norm": 4.808690481483278e-10, "kl": 0.071533203125, "learning_rate": 2.4643595493773652e-06, "loss": 0.0029, "num_tokens": 2422139021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7949133737304771, "frac_reward_zero_std": 1.0, "grad_norm": 3.9560155061190695e-10, "kl": 0.0697021484375, "learning_rate": 2.460444325524367e-06, "loss": 0.0028, "num_tokens": 2422708397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7950840658871725, "frac_reward_zero_std": 1.0, "grad_norm": 4.224192229844208e-10, "kl": 0.0670166015625, "learning_rate": 2.4565317779414734e-06, "loss": 0.0027, "num_tokens": 2423275165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7952547580438679, "frac_reward_zero_std": 1.0, "grad_norm": 5.857400480939624e-10, "kl": 0.0699462890625, "learning_rate": 2.452621908017495e-06, "loss": 0.0028, "num_tokens": 2423838749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7954254502005633, "frac_reward_zero_std": 1.0, "grad_norm": 1.0149048580543554e-09, "kl": 0.073486328125, "learning_rate": 2.4487147171402927e-06, "loss": 0.0029, "num_tokens": 2424396205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7955961423572587, "frac_reward_zero_std": 1.0, "grad_norm": 5.025050139632042e-10, "kl": 0.0697021484375, "learning_rate": 2.4448102066967785e-06, "loss": 0.0028, "num_tokens": 2424967117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7957668345139541, "frac_reward_zero_std": 1.0, "grad_norm": 7.54877497675126e-10, "kl": 0.0733642578125, "learning_rate": 2.440908378072918e-06, "loss": 0.0029, "num_tokens": 2425529645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7959375266706494, "frac_reward_zero_std": 1.0, "grad_norm": 4.22228354374302e-10, "kl": 0.066162109375, "learning_rate": 2.437009232653715e-06, "loss": 0.0027, "num_tokens": 2426101325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7961082188273448, "frac_reward_zero_std": 1.0, "grad_norm": 3.6341788839924606e-10, "kl": 0.0670166015625, "learning_rate": 2.433112771823224e-06, "loss": 0.0027, "num_tokens": 2426673101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7962789109840402, "frac_reward_zero_std": 1.0, "grad_norm": 4.3211502538102154e-10, "kl": 0.0716552734375, "learning_rate": 2.4292189969645506e-06, "loss": 0.0029, "num_tokens": 2427247517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7964496031407357, "frac_reward_zero_std": 1.0, "grad_norm": 7.851797767754244e-10, "kl": 0.073974609375, "learning_rate": 2.425327909459846e-06, "loss": 0.003, "num_tokens": 2427810685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7966202952974311, "frac_reward_zero_std": 1.0, "grad_norm": 5.257250831452539e-10, "kl": 0.0706787109375, "learning_rate": 2.4214395106903e-06, "loss": 0.0028, "num_tokens": 2428374909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7967909874541265, "frac_reward_zero_std": 1.0, "grad_norm": 6.174831592746756e-10, "kl": 0.0672607421875, "learning_rate": 2.417553802036161e-06, "loss": 0.0027, "num_tokens": 2428940957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7969616796108219, "frac_reward_zero_std": 1.0, "grad_norm": 6.069249131189533e-10, "kl": 0.06982421875, "learning_rate": 2.4136707848767094e-06, "loss": 0.0028, "num_tokens": 2429502749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7971323717675173, "frac_reward_zero_std": 1.0, "grad_norm": 6.697980350856374e-10, "kl": 0.074951171875, "learning_rate": 2.40979046059028e-06, "loss": 0.003, "num_tokens": 2430067085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7973030639242127, "frac_reward_zero_std": 1.0, "grad_norm": 6.395203590112931e-10, "kl": 0.068603515625, "learning_rate": 2.4059128305542444e-06, "loss": 0.0027, "num_tokens": 2430630557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7974737560809081, "frac_reward_zero_std": 1.0, "grad_norm": 6.493035215692619e-10, "kl": 0.072265625, "learning_rate": 2.4020378961450255e-06, "loss": 0.0029, "num_tokens": 2431200845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7976444482376035, "frac_reward_zero_std": 1.0, "grad_norm": 7.48326885220018e-10, "kl": 0.0716552734375, "learning_rate": 2.39816565873808e-06, "loss": 0.0029, "num_tokens": 2431768381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7978151403942989, "frac_reward_zero_std": 1.0, "grad_norm": 8.532537682363641e-10, "kl": 0.0736083984375, "learning_rate": 2.3942961197079185e-06, "loss": 0.0029, "num_tokens": 2432330589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7979858325509943, "frac_reward_zero_std": 1.0, "grad_norm": 4.743538411846968e-10, "kl": 0.0732421875, "learning_rate": 2.3904292804280802e-06, "loss": 0.0029, "num_tokens": 2432897213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7981565247076897, "frac_reward_zero_std": 1.0, "grad_norm": 6.492450059625767e-10, "kl": 0.0716552734375, "learning_rate": 2.386565142271162e-06, "loss": 0.0029, "num_tokens": 2433461693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7983272168643851, "frac_reward_zero_std": 1.0, "grad_norm": 6.657556316708571e-10, "kl": 0.07177734375, "learning_rate": 2.3827037066087856e-06, "loss": 0.0029, "num_tokens": 2434025293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7984979090210805, "frac_reward_zero_std": 1.0, "grad_norm": 5.30634210845217e-10, "kl": 0.0738525390625, "learning_rate": 2.378844974811628e-06, "loss": 0.003, "num_tokens": 2434595885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7986686011777758, "frac_reward_zero_std": 1.0, "grad_norm": 1.0698892675378785e-09, "kl": 0.0782470703125, "learning_rate": 2.3749889482493926e-06, "loss": 0.0031, "num_tokens": 2435154765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7988392933344712, "frac_reward_zero_std": 1.0, "grad_norm": 4.564380866230262e-10, "kl": 0.069580078125, "learning_rate": 2.371135628290837e-06, "loss": 0.0028, "num_tokens": 2435728333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7990099854911666, "frac_reward_zero_std": 1.0, "grad_norm": 6.28277108297744e-10, "kl": 0.0694580078125, "learning_rate": 2.3672850163037444e-06, "loss": 0.0028, "num_tokens": 2436290701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.799180677647862, "frac_reward_zero_std": 1.0, "grad_norm": 8.115782404694477e-10, "kl": 0.0728759765625, "learning_rate": 2.3634371136549483e-06, "loss": 0.0029, "num_tokens": 2436853085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7993513698045575, "frac_reward_zero_std": 1.0, "grad_norm": 5.464429191091678e-10, "kl": 0.070556640625, "learning_rate": 2.3595919217103104e-06, "loss": 0.0028, "num_tokens": 2437420397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7995220619612529, "frac_reward_zero_std": 1.0, "grad_norm": 5.149091815092389e-10, "kl": 0.0733642578125, "learning_rate": 2.3557494418347373e-06, "loss": 0.0029, "num_tokens": 2437988077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7996927541179483, "frac_reward_zero_std": 1.0, "grad_norm": 4.4868870596506707e-10, "kl": 0.0699462890625, "learning_rate": 2.3519096753921732e-06, "loss": 0.0028, "num_tokens": 2438554525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7998634462746437, "frac_reward_zero_std": 1.0, "grad_norm": 6.161638204148043e-10, "kl": 0.072021484375, "learning_rate": 2.3480726237455943e-06, "loss": 0.0029, "num_tokens": 2439122621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8000341384313391, "frac_reward_zero_std": 1.0, "grad_norm": 4.4636950496364443e-10, "kl": 0.0687255859375, "learning_rate": 2.3442382882570126e-06, "loss": 0.0028, "num_tokens": 2439690493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8002048305880345, "frac_reward_zero_std": 1.0, "grad_norm": 5.087726660298674e-10, "kl": 0.0704345703125, "learning_rate": 2.3404066702874816e-06, "loss": 0.0028, "num_tokens": 2440256013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8003755227447299, "frac_reward_zero_std": 1.0, "grad_norm": 4.698465192789623e-10, "kl": 0.0672607421875, "learning_rate": 2.3365777711970895e-06, "loss": 0.0027, "num_tokens": 2440822749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8005462149014253, "frac_reward_zero_std": 1.0, "grad_norm": 4.273405761972694e-10, "kl": 0.072509765625, "learning_rate": 2.332751592344955e-06, "loss": 0.0029, "num_tokens": 2441392653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8007169070581207, "frac_reward_zero_std": 1.0, "grad_norm": 6.755436114805029e-10, "kl": 0.0682373046875, "learning_rate": 2.328928135089231e-06, "loss": 0.0027, "num_tokens": 2441962893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8008875992148161, "frac_reward_zero_std": 1.0, "grad_norm": 4.018370136819151e-10, "kl": 0.0660400390625, "learning_rate": 2.325107400787109e-06, "loss": 0.0026, "num_tokens": 2442534685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8010582913715115, "frac_reward_zero_std": 1.0, "grad_norm": 8.215079485966478e-10, "kl": 0.07080078125, "learning_rate": 2.3212893907948154e-06, "loss": 0.0028, "num_tokens": 2443095869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8012289835282069, "frac_reward_zero_std": 1.0, "grad_norm": 6.287420567231189e-10, "kl": 0.0718994140625, "learning_rate": 2.317474106467602e-06, "loss": 0.0029, "num_tokens": 2443669277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8013996756849022, "frac_reward_zero_std": 1.0, "grad_norm": 5.107234972770559e-10, "kl": 0.0703125, "learning_rate": 2.313661549159755e-06, "loss": 0.0028, "num_tokens": 2444233853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8015703678415976, "frac_reward_zero_std": 1.0, "grad_norm": 5.34446158522559e-10, "kl": 0.07080078125, "learning_rate": 2.309851720224596e-06, "loss": 0.0028, "num_tokens": 2444799261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.801741059998293, "frac_reward_zero_std": 1.0, "grad_norm": 6.044831611436315e-10, "kl": 0.069091796875, "learning_rate": 2.3060446210144826e-06, "loss": 0.0028, "num_tokens": 2445372093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8019117521549884, "frac_reward_zero_std": 1.0, "grad_norm": 7.3568072851876e-10, "kl": 0.071533203125, "learning_rate": 2.302240252880792e-06, "loss": 0.0029, "num_tokens": 2445932397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8020824443116839, "frac_reward_zero_std": 1.0, "grad_norm": 5.004689170739821e-10, "kl": 0.0748291015625, "learning_rate": 2.2984386171739358e-06, "loss": 0.003, "num_tokens": 2446496045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8022531364683793, "frac_reward_zero_std": 1.0, "grad_norm": 6.90815246561178e-10, "kl": 0.071044921875, "learning_rate": 2.2946397152433595e-06, "loss": 0.0028, "num_tokens": 2447059997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8024238286250747, "frac_reward_zero_std": 1.0, "grad_norm": 5.057386914733414e-10, "kl": 0.06982421875, "learning_rate": 2.29084354843754e-06, "loss": 0.0028, "num_tokens": 2447626941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8025945207817701, "frac_reward_zero_std": 1.0, "grad_norm": 2.923383656868816e-10, "kl": 0.0762939453125, "learning_rate": 2.287050118103976e-06, "loss": 0.0031, "num_tokens": 2448199501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8027652129384655, "frac_reward_zero_std": 1.0, "grad_norm": 3.118188170271087e-10, "kl": 0.0699462890625, "learning_rate": 2.283259425589196e-06, "loss": 0.0028, "num_tokens": 2448771325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8029359050951609, "frac_reward_zero_std": 1.0, "grad_norm": 5.751402477586479e-10, "kl": 0.073486328125, "learning_rate": 2.2794714722387623e-06, "loss": 0.0029, "num_tokens": 2449335261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8031065972518563, "frac_reward_zero_std": 1.0, "grad_norm": 5.353916001847943e-10, "kl": 0.0699462890625, "learning_rate": 2.2756862593972596e-06, "loss": 0.0028, "num_tokens": 2449903629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8032772894085517, "frac_reward_zero_std": 1.0, "grad_norm": 5.998589928121563e-10, "kl": 0.0704345703125, "learning_rate": 2.2719037884083097e-06, "loss": 0.0028, "num_tokens": 2450469693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8034479815652471, "frac_reward_zero_std": 1.0, "grad_norm": 5.615382076887334e-10, "kl": 0.0750732421875, "learning_rate": 2.2681240606145406e-06, "loss": 0.003, "num_tokens": 2451038909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8036186737219425, "frac_reward_zero_std": 1.0, "grad_norm": 5.002160223399716e-10, "kl": 0.0673828125, "learning_rate": 2.264347077357625e-06, "loss": 0.0027, "num_tokens": 2451607245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8037893658786379, "frac_reward_zero_std": 1.0, "grad_norm": 6.377506548976351e-10, "kl": 0.071044921875, "learning_rate": 2.260572839978257e-06, "loss": 0.0028, "num_tokens": 2452180157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8039600580353333, "frac_reward_zero_std": 1.0, "grad_norm": 7.004520232037994e-10, "kl": 0.0714111328125, "learning_rate": 2.2568013498161546e-06, "loss": 0.0029, "num_tokens": 2452744413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8041307501920287, "frac_reward_zero_std": 1.0, "grad_norm": 9.183289902205812e-10, "kl": 0.074462890625, "learning_rate": 2.2530326082100608e-06, "loss": 0.003, "num_tokens": 2453304589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.804301442348724, "frac_reward_zero_std": 1.0, "grad_norm": 5.175709344953515e-10, "kl": 0.0687255859375, "learning_rate": 2.2492666164977384e-06, "loss": 0.0027, "num_tokens": 2453872109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8044721345054194, "frac_reward_zero_std": 1.0, "grad_norm": 6.335359885760298e-10, "kl": 0.075927734375, "learning_rate": 2.2455033760159817e-06, "loss": 0.003, "num_tokens": 2454448237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8046428266621148, "frac_reward_zero_std": 1.0, "grad_norm": 6.097230246527812e-10, "kl": 0.0711669921875, "learning_rate": 2.2417428881006087e-06, "loss": 0.0028, "num_tokens": 2455018237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8048135188188102, "frac_reward_zero_std": 1.0, "grad_norm": 3.8309577472084184e-10, "kl": 0.0679931640625, "learning_rate": 2.237985154086453e-06, "loss": 0.0027, "num_tokens": 2455593277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8049842109755057, "frac_reward_zero_std": 1.0, "grad_norm": 4.456224619986958e-10, "kl": 0.0716552734375, "learning_rate": 2.2342301753073726e-06, "loss": 0.0029, "num_tokens": 2456160925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8051549031322011, "frac_reward_zero_std": 1.0, "grad_norm": 6.265548121164037e-10, "kl": 0.073486328125, "learning_rate": 2.2304779530962505e-06, "loss": 0.0029, "num_tokens": 2456732669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8053255952888965, "frac_reward_zero_std": 1.0, "grad_norm": 4.697062083074163e-10, "kl": 0.071044921875, "learning_rate": 2.2267284887849948e-06, "loss": 0.0028, "num_tokens": 2457296637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8054962874455919, "frac_reward_zero_std": 1.0, "grad_norm": 7.555458123831987e-10, "kl": 0.0706787109375, "learning_rate": 2.2229817837045275e-06, "loss": 0.0028, "num_tokens": 2457858141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8056669796022873, "frac_reward_zero_std": 1.0, "grad_norm": 5.977516209250894e-10, "kl": 0.07421875, "learning_rate": 2.219237839184789e-06, "loss": 0.003, "num_tokens": 2458420781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8058376717589827, "frac_reward_zero_std": 1.0, "grad_norm": 9.048105294790158e-10, "kl": 0.0732421875, "learning_rate": 2.2154966565547485e-06, "loss": 0.0029, "num_tokens": 2458983437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8060083639156781, "frac_reward_zero_std": 1.0, "grad_norm": 6.720537910036038e-10, "kl": 0.06982421875, "learning_rate": 2.2117582371423928e-06, "loss": 0.0028, "num_tokens": 2459546109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8061790560723735, "frac_reward_zero_std": 1.0, "grad_norm": 4.937035106284702e-10, "kl": 0.0665283203125, "learning_rate": 2.208022582274725e-06, "loss": 0.0027, "num_tokens": 2460111133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8063497482290689, "frac_reward_zero_std": 1.0, "grad_norm": 5.151420590360883e-10, "kl": 0.067626953125, "learning_rate": 2.2042896932777624e-06, "loss": 0.0027, "num_tokens": 2460682205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8065204403857643, "frac_reward_zero_std": 1.0, "grad_norm": 4.76971413776151e-10, "kl": 0.0692138671875, "learning_rate": 2.2005595714765494e-06, "loss": 0.0028, "num_tokens": 2461247901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8066911325424597, "frac_reward_zero_std": 1.0, "grad_norm": 5.087544041019049e-10, "kl": 0.069091796875, "learning_rate": 2.1968322181951484e-06, "loss": 0.0028, "num_tokens": 2461814493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8068618246991551, "frac_reward_zero_std": 1.0, "grad_norm": 6.246007681898557e-10, "kl": 0.0748291015625, "learning_rate": 2.1931076347566285e-06, "loss": 0.003, "num_tokens": 2462376845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8070325168558504, "frac_reward_zero_std": 1.0, "grad_norm": 6.129728195234671e-10, "kl": 0.0706787109375, "learning_rate": 2.189385822483089e-06, "loss": 0.0028, "num_tokens": 2462938877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8072032090125458, "frac_reward_zero_std": 1.0, "grad_norm": 8.319105427426301e-10, "kl": 0.075439453125, "learning_rate": 2.1856667826956314e-06, "loss": 0.003, "num_tokens": 2463502461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8073739011692412, "frac_reward_zero_std": 1.0, "grad_norm": 4.850197278294365e-10, "kl": 0.0728759765625, "learning_rate": 2.1819505167143885e-06, "loss": 0.0029, "num_tokens": 2464071981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8075445933259366, "frac_reward_zero_std": 1.0, "grad_norm": 5.335695940703637e-10, "kl": 0.0679931640625, "learning_rate": 2.178237025858494e-06, "loss": 0.0027, "num_tokens": 2464640877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.807715285482632, "frac_reward_zero_std": 1.0, "grad_norm": 5.505679236752397e-10, "kl": 0.072998046875, "learning_rate": 2.174526311446109e-06, "loss": 0.0029, "num_tokens": 2465207997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8078859776393275, "frac_reward_zero_std": 1.0, "grad_norm": 4.2634744922793913e-10, "kl": 0.0709228515625, "learning_rate": 2.1708183747943966e-06, "loss": 0.0028, "num_tokens": 2465776093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8080566697960229, "frac_reward_zero_std": 1.0, "grad_norm": 5.093314861256442e-10, "kl": 0.0709228515625, "learning_rate": 2.1671132172195462e-06, "loss": 0.0028, "num_tokens": 2466349069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8082273619527183, "frac_reward_zero_std": 1.0, "grad_norm": 5.204726767359984e-10, "kl": 0.0709228515625, "learning_rate": 2.1634108400367513e-06, "loss": 0.0028, "num_tokens": 2466913325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8083980541094137, "frac_reward_zero_std": 1.0, "grad_norm": 4.2387676785995665e-10, "kl": 0.072021484375, "learning_rate": 2.1597112445602254e-06, "loss": 0.0029, "num_tokens": 2467486477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8085687462661091, "frac_reward_zero_std": 1.0, "grad_norm": 6.539004280435e-10, "kl": 0.07568359375, "learning_rate": 2.1560144321031873e-06, "loss": 0.003, "num_tokens": 2468051501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8087394384228045, "frac_reward_zero_std": 1.0, "grad_norm": 7.671070471954664e-10, "kl": 0.071533203125, "learning_rate": 2.152320403977877e-06, "loss": 0.0029, "num_tokens": 2468616285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8089101305794999, "frac_reward_zero_std": 1.0, "grad_norm": 4.184953511625626e-10, "kl": 0.064697265625, "learning_rate": 2.1486291614955357e-06, "loss": 0.0026, "num_tokens": 2469183501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8090808227361953, "frac_reward_zero_std": 1.0, "grad_norm": 7.595928841516533e-10, "kl": 0.070068359375, "learning_rate": 2.144940705966425e-06, "loss": 0.0028, "num_tokens": 2469743869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8092515148928907, "frac_reward_zero_std": 1.0, "grad_norm": 5.410913165249548e-10, "kl": 0.0706787109375, "learning_rate": 2.141255038699811e-06, "loss": 0.0028, "num_tokens": 2470316109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8094222070495861, "frac_reward_zero_std": 1.0, "grad_norm": 4.945741229374593e-10, "kl": 0.0693359375, "learning_rate": 2.1375721610039766e-06, "loss": 0.0028, "num_tokens": 2470893341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8095928992062815, "frac_reward_zero_std": 1.0, "grad_norm": 5.481771852853818e-10, "kl": 0.0689697265625, "learning_rate": 2.1338920741862044e-06, "loss": 0.0028, "num_tokens": 2471462893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8097635913629768, "frac_reward_zero_std": 1.0, "grad_norm": 5.539438376405222e-10, "kl": 0.072021484375, "learning_rate": 2.1302147795527994e-06, "loss": 0.0029, "num_tokens": 2472028861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8099342835196722, "frac_reward_zero_std": 1.0, "grad_norm": 6.981036945710958e-10, "kl": 0.0740966796875, "learning_rate": 2.1265402784090615e-06, "loss": 0.003, "num_tokens": 2472593757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8101049756763676, "frac_reward_zero_std": 1.0, "grad_norm": 5.048423754197387e-10, "kl": 0.0672607421875, "learning_rate": 2.1228685720593113e-06, "loss": 0.0027, "num_tokens": 2473165165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.810275667833063, "frac_reward_zero_std": 1.0, "grad_norm": 5.104129618875405e-10, "kl": 0.070556640625, "learning_rate": 2.119199661806868e-06, "loss": 0.0028, "num_tokens": 2473733901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8104463599897584, "frac_reward_zero_std": 1.0, "grad_norm": 4.392543041897248e-10, "kl": 0.0679931640625, "learning_rate": 2.115533548954064e-06, "loss": 0.0027, "num_tokens": 2474299517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8106170521464539, "frac_reward_zero_std": 1.0, "grad_norm": 5.363741009326762e-10, "kl": 0.0726318359375, "learning_rate": 2.111870234802239e-06, "loss": 0.0029, "num_tokens": 2474865533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8107877443031493, "frac_reward_zero_std": 1.0, "grad_norm": 6.620047800018993e-10, "kl": 0.0701904296875, "learning_rate": 2.1082097206517373e-06, "loss": 0.0028, "num_tokens": 2475430109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8109584364598447, "frac_reward_zero_std": 1.0, "grad_norm": 0.009401889508648323, "kl": 0.09521484375, "learning_rate": 2.1045520078019033e-06, "loss": 0.0038, "num_tokens": 2476021837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8111291286165401, "frac_reward_zero_std": 1.0, "grad_norm": 3.982072763691321e-10, "kl": 0.0670166015625, "learning_rate": 2.100897097551098e-06, "loss": 0.0027, "num_tokens": 2476588189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8112998207732355, "frac_reward_zero_std": 1.0, "grad_norm": 5.077418373947268e-10, "kl": 0.0687255859375, "learning_rate": 2.0972449911966843e-06, "loss": 0.0027, "num_tokens": 2477157117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8114705129299309, "frac_reward_zero_std": 1.0, "grad_norm": 6.757777103444614e-10, "kl": 0.0703125, "learning_rate": 2.0935956900350252e-06, "loss": 0.0028, "num_tokens": 2477723101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8116412050866263, "frac_reward_zero_std": 1.0, "grad_norm": 4.886785340162737e-10, "kl": 0.0706787109375, "learning_rate": 2.089949195361489e-06, "loss": 0.0028, "num_tokens": 2478290173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8118118972433217, "frac_reward_zero_std": 1.0, "grad_norm": 6.999281052648675e-10, "kl": 0.072265625, "learning_rate": 2.086305508470452e-06, "loss": 0.0029, "num_tokens": 2478851645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8119825894000171, "frac_reward_zero_std": 1.0, "grad_norm": 6.933515470827663e-10, "kl": 0.0716552734375, "learning_rate": 2.0826646306552945e-06, "loss": 0.0029, "num_tokens": 2479412061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8121532815567125, "frac_reward_zero_std": 1.0, "grad_norm": 4.464456621472768e-10, "kl": 0.0709228515625, "learning_rate": 2.079026563208394e-06, "loss": 0.0028, "num_tokens": 2479982093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8123239737134079, "frac_reward_zero_std": 1.0, "grad_norm": 6.192022876586456e-10, "kl": 0.070068359375, "learning_rate": 2.0753913074211287e-06, "loss": 0.0028, "num_tokens": 2480544797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8124946658701032, "frac_reward_zero_std": 1.0, "grad_norm": 6.39906305731771e-10, "kl": 0.0748291015625, "learning_rate": 2.0717588645838883e-06, "loss": 0.003, "num_tokens": 2481109949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8126653580267986, "frac_reward_zero_std": 1.0, "grad_norm": 4.627433522534644e-10, "kl": 0.070068359375, "learning_rate": 2.0681292359860607e-06, "loss": 0.0028, "num_tokens": 2481675373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.812836050183494, "frac_reward_zero_std": 1.0, "grad_norm": 6.889608954822224e-10, "kl": 0.0716552734375, "learning_rate": 2.0645024229160305e-06, "loss": 0.0029, "num_tokens": 2482240557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8130067423401894, "frac_reward_zero_std": 1.0, "grad_norm": 5.613288632452799e-10, "kl": 0.0667724609375, "learning_rate": 2.0608784266611816e-06, "loss": 0.0027, "num_tokens": 2482808509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8131774344968848, "frac_reward_zero_std": 1.0, "grad_norm": 4.406086729765797e-10, "kl": 0.06884765625, "learning_rate": 2.0572572485079056e-06, "loss": 0.0028, "num_tokens": 2483378269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8133481266535803, "frac_reward_zero_std": 1.0, "grad_norm": 7.118592960711197e-10, "kl": 0.0726318359375, "learning_rate": 2.0536388897415905e-06, "loss": 0.0029, "num_tokens": 2483938557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8135188188102757, "frac_reward_zero_std": 1.0, "grad_norm": 5.150565001599994e-10, "kl": 0.0733642578125, "learning_rate": 2.0500233516466275e-06, "loss": 0.0029, "num_tokens": 2484505037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8136895109669711, "frac_reward_zero_std": 1.0, "grad_norm": 7.245876220816714e-10, "kl": 0.0673828125, "learning_rate": 2.0464106355063916e-06, "loss": 0.0027, "num_tokens": 2485068013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8138602031236665, "frac_reward_zero_std": 1.0, "grad_norm": 5.739712505063092e-10, "kl": 0.0716552734375, "learning_rate": 2.0428007426032714e-06, "loss": 0.0029, "num_tokens": 2485636845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8140308952803619, "frac_reward_zero_std": 1.0, "grad_norm": 5.218517838116629e-10, "kl": 0.0689697265625, "learning_rate": 2.03919367421865e-06, "loss": 0.0028, "num_tokens": 2486202813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8142015874370573, "frac_reward_zero_std": 1.0, "grad_norm": 2.5777235937170767e-10, "kl": 0.0693359375, "learning_rate": 2.0355894316329085e-06, "loss": 0.0028, "num_tokens": 2486776925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8143722795937527, "frac_reward_zero_std": 1.0, "grad_norm": 5.041600237463004e-10, "kl": 0.0701904296875, "learning_rate": 2.0319880161254192e-06, "loss": 0.0028, "num_tokens": 2487343981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8145429717504481, "frac_reward_zero_std": 1.0, "grad_norm": 3.4427556411651503e-10, "kl": 0.0684814453125, "learning_rate": 2.028389428974553e-06, "loss": 0.0027, "num_tokens": 2487912685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8147136639071435, "frac_reward_zero_std": 1.0, "grad_norm": 6.29727786019787e-10, "kl": 0.0753173828125, "learning_rate": 2.0247936714576812e-06, "loss": 0.003, "num_tokens": 2488474205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8148843560638389, "frac_reward_zero_std": 1.0, "grad_norm": 2.9280899653328785e-10, "kl": 0.0673828125, "learning_rate": 2.0212007448511694e-06, "loss": 0.0027, "num_tokens": 2489046733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8150550482205343, "frac_reward_zero_std": 1.0, "grad_norm": 3.847826729012218e-10, "kl": 0.070068359375, "learning_rate": 2.017610650430376e-06, "loss": 0.0028, "num_tokens": 2489619117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8152257403772296, "frac_reward_zero_std": 1.0, "grad_norm": 7.346884813376898e-10, "kl": 0.072265625, "learning_rate": 2.0140233894696515e-06, "loss": 0.0029, "num_tokens": 2490187997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.815396432533925, "frac_reward_zero_std": 1.0, "grad_norm": 5.555865886380075e-10, "kl": 0.069091796875, "learning_rate": 2.010438963242346e-06, "loss": 0.0028, "num_tokens": 2490750413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8155671246906204, "frac_reward_zero_std": 1.0, "grad_norm": 4.252298850293697e-10, "kl": 0.0697021484375, "learning_rate": 2.006857373020804e-06, "loss": 0.0028, "num_tokens": 2491319645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8157378168473158, "frac_reward_zero_std": 1.0, "grad_norm": 5.951350437464206e-10, "kl": 0.07080078125, "learning_rate": 2.00327862007636e-06, "loss": 0.0028, "num_tokens": 2491883117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8159085090040112, "frac_reward_zero_std": 1.0, "grad_norm": 3.999125506240783e-10, "kl": 0.073974609375, "learning_rate": 1.9997027056793374e-06, "loss": 0.003, "num_tokens": 2492451501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8160792011607066, "frac_reward_zero_std": 1.0, "grad_norm": 4.027311486132906e-10, "kl": 0.07177734375, "learning_rate": 1.9961296310990608e-06, "loss": 0.0029, "num_tokens": 2493021949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.816249893317402, "frac_reward_zero_std": 1.0, "grad_norm": 6.116396205422858e-10, "kl": 0.0704345703125, "learning_rate": 1.9925593976038436e-06, "loss": 0.0028, "num_tokens": 2493585421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8164205854740975, "frac_reward_zero_std": 1.0, "grad_norm": 6.582612313862909e-10, "kl": 0.0731201171875, "learning_rate": 1.988992006460989e-06, "loss": 0.0029, "num_tokens": 2494150349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8165912776307929, "frac_reward_zero_std": 1.0, "grad_norm": 8.091832236429875e-10, "kl": 0.0750732421875, "learning_rate": 1.9854274589367883e-06, "loss": 0.003, "num_tokens": 2494715277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8167619697874883, "frac_reward_zero_std": 1.0, "grad_norm": 6.569017553468175e-10, "kl": 0.0726318359375, "learning_rate": 1.9818657562965305e-06, "loss": 0.0029, "num_tokens": 2495280205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8169326619441837, "frac_reward_zero_std": 1.0, "grad_norm": 5.832516663378513e-10, "kl": 0.068359375, "learning_rate": 1.978306899804494e-06, "loss": 0.0027, "num_tokens": 2495844029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8171033541008791, "frac_reward_zero_std": 1.0, "grad_norm": 6.500570764862015e-10, "kl": 0.072021484375, "learning_rate": 1.974750890723941e-06, "loss": 0.0029, "num_tokens": 2496404573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8172740462575745, "frac_reward_zero_std": 1.0, "grad_norm": 7.537933123906654e-10, "kl": 0.0745849609375, "learning_rate": 1.9711977303171248e-06, "loss": 0.003, "num_tokens": 2496970973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8174447384142699, "frac_reward_zero_std": 1.0, "grad_norm": 3.8817912025451847e-10, "kl": 0.0699462890625, "learning_rate": 1.967647419845291e-06, "loss": 0.0028, "num_tokens": 2497540285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8176154305709653, "frac_reward_zero_std": 1.0, "grad_norm": 3.934893247079506e-10, "kl": 0.06982421875, "learning_rate": 1.964099960568675e-06, "loss": 0.0028, "num_tokens": 2498109709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8177861227276607, "frac_reward_zero_std": 1.0, "grad_norm": 7.148419164094818e-10, "kl": 0.0750732421875, "learning_rate": 1.9605553537464906e-06, "loss": 0.003, "num_tokens": 2498676157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.817956814884356, "frac_reward_zero_std": 1.0, "grad_norm": 4.358738235625458e-10, "kl": 0.066650390625, "learning_rate": 1.9570136006369513e-06, "loss": 0.0027, "num_tokens": 2499241725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8181275070410514, "frac_reward_zero_std": 1.0, "grad_norm": 4.773023747676384e-10, "kl": 0.0784912109375, "learning_rate": 1.9534747024972455e-06, "loss": 0.0031, "num_tokens": 2499806413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8182981991977468, "frac_reward_zero_std": 1.0, "grad_norm": 6.152840195993737e-10, "kl": 0.06884765625, "learning_rate": 1.94993866058356e-06, "loss": 0.0028, "num_tokens": 2500371277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8184688913544422, "frac_reward_zero_std": 1.0, "grad_norm": 4.032686588049366e-10, "kl": 0.0711669921875, "learning_rate": 1.9464054761510574e-06, "loss": 0.0029, "num_tokens": 2500940845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8186395835111376, "frac_reward_zero_std": 1.0, "grad_norm": 4.54645442827323e-10, "kl": 0.0693359375, "learning_rate": 1.9428751504538957e-06, "loss": 0.0028, "num_tokens": 2501505933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.818810275667833, "frac_reward_zero_std": 1.0, "grad_norm": 4.5030422804829e-10, "kl": 0.0714111328125, "learning_rate": 1.9393476847452064e-06, "loss": 0.0029, "num_tokens": 2502073213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8189809678245284, "frac_reward_zero_std": 1.0, "grad_norm": 7.100808628599305e-10, "kl": 0.0721435546875, "learning_rate": 1.9358230802771196e-06, "loss": 0.0029, "num_tokens": 2502635453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8191516599812239, "frac_reward_zero_std": 1.0, "grad_norm": 5.093749841497805e-10, "kl": 0.0673828125, "learning_rate": 1.932301338300736e-06, "loss": 0.0027, "num_tokens": 2503199373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8193223521379193, "frac_reward_zero_std": 1.0, "grad_norm": 3.839970653333436e-10, "kl": 0.06982421875, "learning_rate": 1.928782460066152e-06, "loss": 0.0028, "num_tokens": 2503770445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8194930442946147, "frac_reward_zero_std": 1.0, "grad_norm": 6.054991271386403e-10, "kl": 0.0654296875, "learning_rate": 1.925266446822439e-06, "loss": 0.0026, "num_tokens": 2504333725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8196637364513101, "frac_reward_zero_std": 1.0, "grad_norm": 6.773782808433034e-10, "kl": 0.0706787109375, "learning_rate": 1.921753299817659e-06, "loss": 0.0028, "num_tokens": 2504896317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8198344286080055, "frac_reward_zero_std": 1.0, "grad_norm": 5.996391859248768e-10, "kl": 0.06982421875, "learning_rate": 1.9182430202988467e-06, "loss": 0.0028, "num_tokens": 2505462205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8200051207647009, "frac_reward_zero_std": 1.0, "grad_norm": 5.643854907643955e-10, "kl": 0.0682373046875, "learning_rate": 1.9147356095120297e-06, "loss": 0.0027, "num_tokens": 2506024477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8201758129213963, "frac_reward_zero_std": 1.0, "grad_norm": 4.392823929464156e-10, "kl": 0.070556640625, "learning_rate": 1.9112310687022086e-06, "loss": 0.0028, "num_tokens": 2506586813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8203465050780917, "frac_reward_zero_std": 1.0, "grad_norm": 5.572827576798539e-10, "kl": 0.0771484375, "learning_rate": 1.9077293991133715e-06, "loss": 0.0031, "num_tokens": 2507151581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8205171972347871, "frac_reward_zero_std": 1.0, "grad_norm": 6.268497458765499e-10, "kl": 0.0709228515625, "learning_rate": 1.9042306019884816e-06, "loss": 0.0028, "num_tokens": 2507715741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8206878893914825, "frac_reward_zero_std": 1.0, "grad_norm": 7.09734196148631e-10, "kl": 0.074462890625, "learning_rate": 1.9007346785694868e-06, "loss": 0.003, "num_tokens": 2508279645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8208585815481778, "frac_reward_zero_std": 1.0, "grad_norm": 6.3041808984065e-10, "kl": 0.067138671875, "learning_rate": 1.8972416300973207e-06, "loss": 0.0027, "num_tokens": 2508841741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8210292737048732, "frac_reward_zero_std": 1.0, "grad_norm": 4.1030833426859204e-10, "kl": 0.0701904296875, "learning_rate": 1.8937514578118777e-06, "loss": 0.0028, "num_tokens": 2509409949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8211999658615686, "frac_reward_zero_std": 1.0, "grad_norm": 4.451915071289755e-10, "kl": 0.067138671875, "learning_rate": 1.890264162952048e-06, "loss": 0.0027, "num_tokens": 2509991277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.821370658018264, "frac_reward_zero_std": 1.0, "grad_norm": 6.623404925249713e-10, "kl": 0.0743408203125, "learning_rate": 1.8867797467556958e-06, "loss": 0.003, "num_tokens": 2510552189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8215413501749594, "frac_reward_zero_std": 1.0, "grad_norm": 6.11418911824996e-10, "kl": 0.0723876953125, "learning_rate": 1.8832982104596665e-06, "loss": 0.0029, "num_tokens": 2511114525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8217120423316548, "frac_reward_zero_std": 1.0, "grad_norm": 5.406741133616564e-10, "kl": 0.0692138671875, "learning_rate": 1.8798195552997756e-06, "loss": 0.0028, "num_tokens": 2511679661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8218827344883503, "frac_reward_zero_std": 1.0, "grad_norm": 6.928609719461925e-10, "kl": 0.0712890625, "learning_rate": 1.8763437825108178e-06, "loss": 0.0028, "num_tokens": 2512240813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8220534266450457, "frac_reward_zero_std": 1.0, "grad_norm": 7.568787486003823e-10, "kl": 0.069580078125, "learning_rate": 1.8728708933265715e-06, "loss": 0.0028, "num_tokens": 2512801117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8222241188017411, "frac_reward_zero_std": 1.0, "grad_norm": 6.284355380626445e-10, "kl": 0.07470703125, "learning_rate": 1.8694008889797866e-06, "loss": 0.003, "num_tokens": 2513365229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8223948109584365, "frac_reward_zero_std": 1.0, "grad_norm": 5.939543258161647e-10, "kl": 0.0677490234375, "learning_rate": 1.8659337707021896e-06, "loss": 0.0027, "num_tokens": 2513931117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8225655031151319, "frac_reward_zero_std": 1.0, "grad_norm": 7.207251318174732e-10, "kl": 0.0726318359375, "learning_rate": 1.8624695397244773e-06, "loss": 0.0029, "num_tokens": 2514492141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8227361952718273, "frac_reward_zero_std": 1.0, "grad_norm": 5.125369282687867e-10, "kl": 0.0706787109375, "learning_rate": 1.8590081972763319e-06, "loss": 0.0028, "num_tokens": 2515058797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8229068874285227, "frac_reward_zero_std": 1.0, "grad_norm": 6.380282639271197e-10, "kl": 0.0718994140625, "learning_rate": 1.8555497445864046e-06, "loss": 0.0029, "num_tokens": 2515626317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8230775795852181, "frac_reward_zero_std": 1.0, "grad_norm": 6.948267922167869e-10, "kl": 0.07177734375, "learning_rate": 1.8520941828823213e-06, "loss": 0.0029, "num_tokens": 2516188237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8232482717419135, "frac_reward_zero_std": 1.0, "grad_norm": 5.761204552090566e-10, "kl": 0.070556640625, "learning_rate": 1.8486415133906777e-06, "loss": 0.0028, "num_tokens": 2516750893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8234189638986089, "frac_reward_zero_std": 1.0, "grad_norm": 6.108606290060805e-10, "kl": 0.0751953125, "learning_rate": 1.8451917373370498e-06, "loss": 0.003, "num_tokens": 2517312589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8235896560553042, "frac_reward_zero_std": 1.0, "grad_norm": 5.894834445270189e-10, "kl": 0.068359375, "learning_rate": 1.8417448559459849e-06, "loss": 0.0027, "num_tokens": 2517882957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8237603482119996, "frac_reward_zero_std": 1.0, "grad_norm": 6.648312934888059e-10, "kl": 0.0733642578125, "learning_rate": 1.838300870441001e-06, "loss": 0.0029, "num_tokens": 2518449261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.823931040368695, "frac_reward_zero_std": 1.0, "grad_norm": 4.814512137057138e-10, "kl": 0.0760498046875, "learning_rate": 1.8348597820445857e-06, "loss": 0.003, "num_tokens": 2519017229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8241017325253904, "frac_reward_zero_std": 1.0, "grad_norm": 4.246951421666684e-10, "kl": 0.0726318359375, "learning_rate": 1.831421591978203e-06, "loss": 0.0029, "num_tokens": 2519586173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8242724246820858, "frac_reward_zero_std": 1.0, "grad_norm": 4.471203798779964e-10, "kl": 0.066650390625, "learning_rate": 1.8279863014622878e-06, "loss": 0.0027, "num_tokens": 2520153917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8244431168387812, "frac_reward_zero_std": 1.0, "grad_norm": 6.451020186669046e-10, "kl": 0.072509765625, "learning_rate": 1.8245539117162414e-06, "loss": 0.0029, "num_tokens": 2520717469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8246138089954766, "frac_reward_zero_std": 1.0, "grad_norm": 4.599182062932814e-10, "kl": 0.0701904296875, "learning_rate": 1.8211244239584413e-06, "loss": 0.0028, "num_tokens": 2521284445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8247845011521721, "frac_reward_zero_std": 1.0, "grad_norm": 5.687897392996925e-10, "kl": 0.068359375, "learning_rate": 1.8176978394062284e-06, "loss": 0.0027, "num_tokens": 2521848733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8249551933088675, "frac_reward_zero_std": 1.0, "grad_norm": 6.258552735498888e-10, "kl": 0.0693359375, "learning_rate": 1.8142741592759205e-06, "loss": 0.0028, "num_tokens": 2522417069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8251258854655629, "frac_reward_zero_std": 1.0, "grad_norm": 6.398935280360394e-10, "kl": 0.07275390625, "learning_rate": 1.8108533847827959e-06, "loss": 0.0029, "num_tokens": 2522980669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8252965776222583, "frac_reward_zero_std": 1.0, "grad_norm": 4.877523481998297e-10, "kl": 0.0706787109375, "learning_rate": 1.8074355171411117e-06, "loss": 0.0028, "num_tokens": 2523545069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8254672697789537, "frac_reward_zero_std": 1.0, "grad_norm": 4.5979724653911727e-10, "kl": 0.0693359375, "learning_rate": 1.8040205575640813e-06, "loss": 0.0028, "num_tokens": 2524111677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8256379619356491, "frac_reward_zero_std": 1.0, "grad_norm": 6.518243770389118e-10, "kl": 0.068359375, "learning_rate": 1.8006085072638967e-06, "loss": 0.0027, "num_tokens": 2524676413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8258086540923445, "frac_reward_zero_std": 1.0, "grad_norm": 5.642896761107687e-10, "kl": 0.070068359375, "learning_rate": 1.7971993674517096e-06, "loss": 0.0028, "num_tokens": 2525242205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8259793462490399, "frac_reward_zero_std": 1.0, "grad_norm": 4.754455713339797e-10, "kl": 0.0694580078125, "learning_rate": 1.793793139337645e-06, "loss": 0.0028, "num_tokens": 2525807277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8261500384057353, "frac_reward_zero_std": 1.0, "grad_norm": 5.781916250863936e-10, "kl": 0.07177734375, "learning_rate": 1.7903898241307872e-06, "loss": 0.0029, "num_tokens": 2526374909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8263207305624306, "frac_reward_zero_std": 1.0, "grad_norm": 5.936649663440446e-10, "kl": 0.067138671875, "learning_rate": 1.7869894230391938e-06, "loss": 0.0027, "num_tokens": 2526942253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.826491422719126, "frac_reward_zero_std": 1.0, "grad_norm": 8.05326666075199e-10, "kl": 0.0740966796875, "learning_rate": 1.7835919372698796e-06, "loss": 0.003, "num_tokens": 2527504973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8266621148758214, "frac_reward_zero_std": 1.0, "grad_norm": 3.5009181572454864e-10, "kl": 0.0672607421875, "learning_rate": 1.7801973680288354e-06, "loss": 0.0027, "num_tokens": 2528070461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8268328070325168, "frac_reward_zero_std": 1.0, "grad_norm": 5.243155739564816e-10, "kl": 0.0711669921875, "learning_rate": 1.7768057165210052e-06, "loss": 0.0029, "num_tokens": 2528642029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8270034991892122, "frac_reward_zero_std": 1.0, "grad_norm": 6.724930284442357e-10, "kl": 0.070068359375, "learning_rate": 1.773416983950308e-06, "loss": 0.0028, "num_tokens": 2529206797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8271741913459076, "frac_reward_zero_std": 1.0, "grad_norm": 6.361545940105032e-10, "kl": 0.0743408203125, "learning_rate": 1.770031171519616e-06, "loss": 0.003, "num_tokens": 2529773549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.827344883502603, "frac_reward_zero_std": 1.0, "grad_norm": 5.28437606782144e-10, "kl": 0.0667724609375, "learning_rate": 1.7666482804307761e-06, "loss": 0.0027, "num_tokens": 2530340029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8275155756592985, "frac_reward_zero_std": 1.0, "grad_norm": 4.6116046755808974e-10, "kl": 0.0682373046875, "learning_rate": 1.7632683118845873e-06, "loss": 0.0027, "num_tokens": 2530906621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8276862678159939, "frac_reward_zero_std": 1.0, "grad_norm": 6.152753622181058e-10, "kl": 0.072265625, "learning_rate": 1.759891267080821e-06, "loss": 0.0029, "num_tokens": 2531471837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8278569599726893, "frac_reward_zero_std": 1.0, "grad_norm": 5.293940663650393e-10, "kl": 0.0675048828125, "learning_rate": 1.7565171472182009e-06, "loss": 0.0027, "num_tokens": 2532037789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8280276521293847, "frac_reward_zero_std": 1.0, "grad_norm": 6.812859424087441e-10, "kl": 0.078369140625, "learning_rate": 1.7531459534944217e-06, "loss": 0.0031, "num_tokens": 2532602941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8281983442860801, "frac_reward_zero_std": 1.0, "grad_norm": 4.156273468518916e-10, "kl": 0.06982421875, "learning_rate": 1.749777687106138e-06, "loss": 0.0028, "num_tokens": 2533170317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8283690364427755, "frac_reward_zero_std": 1.0, "grad_norm": 4.772855324838775e-10, "kl": 0.0723876953125, "learning_rate": 1.7464123492489582e-06, "loss": 0.0029, "num_tokens": 2533736989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8285397285994709, "frac_reward_zero_std": 1.0, "grad_norm": 4.890652863612e-10, "kl": 0.070556640625, "learning_rate": 1.743049941117455e-06, "loss": 0.0028, "num_tokens": 2534302237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8287104207561663, "frac_reward_zero_std": 1.0, "grad_norm": 5.623547136039423e-10, "kl": 0.070068359375, "learning_rate": 1.7396904639051636e-06, "loss": 0.0028, "num_tokens": 2534866429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8288811129128617, "frac_reward_zero_std": 1.0, "grad_norm": 5.493072853884314e-10, "kl": 0.0723876953125, "learning_rate": 1.7363339188045803e-06, "loss": 0.0029, "num_tokens": 2535431069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.829051805069557, "frac_reward_zero_std": 1.0, "grad_norm": 5.129881537407932e-10, "kl": 0.072021484375, "learning_rate": 1.7329803070071549e-06, "loss": 0.0029, "num_tokens": 2535999453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8292224972262524, "frac_reward_zero_std": 1.0, "grad_norm": 8.382099703667794e-10, "kl": 0.07080078125, "learning_rate": 1.7296296297032944e-06, "loss": 0.0028, "num_tokens": 2536561901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8293931893829478, "frac_reward_zero_std": 1.0, "grad_norm": 5.430381666793445e-10, "kl": 0.068359375, "learning_rate": 1.726281888082374e-06, "loss": 0.0027, "num_tokens": 2537127725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8295638815396432, "frac_reward_zero_std": 1.0, "grad_norm": 8.228724230258661e-10, "kl": 0.0711669921875, "learning_rate": 1.722937083332721e-06, "loss": 0.0028, "num_tokens": 2537690797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8297345736963386, "frac_reward_zero_std": 1.0, "grad_norm": 4.340344011028753e-10, "kl": 0.071044921875, "learning_rate": 1.7195952166416175e-06, "loss": 0.0028, "num_tokens": 2538259261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.829905265853034, "frac_reward_zero_std": 1.0, "grad_norm": 5.83123066235978e-10, "kl": 0.0699462890625, "learning_rate": 1.716256289195305e-06, "loss": 0.0028, "num_tokens": 2538823277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8300759580097294, "frac_reward_zero_std": 1.0, "grad_norm": 6.305713336707523e-10, "kl": 0.0694580078125, "learning_rate": 1.712920302178984e-06, "loss": 0.0028, "num_tokens": 2539392077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8302466501664248, "frac_reward_zero_std": 1.0, "grad_norm": 6.487341292317743e-10, "kl": 0.070068359375, "learning_rate": 1.7095872567768123e-06, "loss": 0.0028, "num_tokens": 2539954189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8304173423231203, "frac_reward_zero_std": 1.0, "grad_norm": 3.799537987826317e-10, "kl": 0.0657958984375, "learning_rate": 1.7062571541718976e-06, "loss": 0.0026, "num_tokens": 2540517773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8305880344798157, "frac_reward_zero_std": 1.0, "grad_norm": 0.00860288399867303, "kl": 0.0948486328125, "learning_rate": 1.7029299955463031e-06, "loss": 0.0038, "num_tokens": 2541113165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8307587266365111, "frac_reward_zero_std": 1.0, "grad_norm": 5.387583245974598e-10, "kl": 0.068603515625, "learning_rate": 1.699605782081053e-06, "loss": 0.0028, "num_tokens": 2541680685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8309294187932065, "frac_reward_zero_std": 1.0, "grad_norm": 4.514044653558521e-10, "kl": 0.06982421875, "learning_rate": 1.6962845149561235e-06, "loss": 0.0028, "num_tokens": 2542247981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8311001109499019, "frac_reward_zero_std": 1.0, "grad_norm": 6.124792495549987e-10, "kl": 0.0684814453125, "learning_rate": 1.6929661953504495e-06, "loss": 0.0027, "num_tokens": 2542811389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8312708031065973, "frac_reward_zero_std": 1.0, "grad_norm": 6.18239764866583e-10, "kl": 0.0733642578125, "learning_rate": 1.689650824441904e-06, "loss": 0.0029, "num_tokens": 2543376445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8314414952632927, "frac_reward_zero_std": 1.0, "grad_norm": 2.8247446916845535e-10, "kl": 0.071044921875, "learning_rate": 1.6863384034073304e-06, "loss": 0.0028, "num_tokens": 2543943885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8316121874199881, "frac_reward_zero_std": 1.0, "grad_norm": 4.091338253802465e-10, "kl": 0.07275390625, "learning_rate": 1.683028933422517e-06, "loss": 0.0029, "num_tokens": 2544512157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8317828795766834, "frac_reward_zero_std": 1.0, "grad_norm": 7.430361594591934e-10, "kl": 0.0712890625, "learning_rate": 1.6797224156622094e-06, "loss": 0.0028, "num_tokens": 2545072109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8319535717333788, "frac_reward_zero_std": 1.0, "grad_norm": 5.536984857096199e-10, "kl": 0.0714111328125, "learning_rate": 1.6764188513001e-06, "loss": 0.0029, "num_tokens": 2545639517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8321242638900742, "frac_reward_zero_std": 1.0, "grad_norm": 7.10058993544406e-10, "kl": 0.07275390625, "learning_rate": 1.673118241508831e-06, "loss": 0.0029, "num_tokens": 2546209517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8322949560467696, "frac_reward_zero_std": 1.0, "grad_norm": 6.088512532548417e-10, "kl": 0.0706787109375, "learning_rate": 1.6698205874600038e-06, "loss": 0.0028, "num_tokens": 2546775933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.832465648203465, "frac_reward_zero_std": 1.0, "grad_norm": 5.914701874434557e-10, "kl": 0.072509765625, "learning_rate": 1.6665258903241677e-06, "loss": 0.0029, "num_tokens": 2547342573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8326363403601604, "frac_reward_zero_std": 1.0, "grad_norm": 6.389512559174708e-10, "kl": 0.06982421875, "learning_rate": 1.6632341512708194e-06, "loss": 0.0028, "num_tokens": 2547911725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8328070325168558, "frac_reward_zero_std": 1.0, "grad_norm": 7.949834999216959e-10, "kl": 0.069091796875, "learning_rate": 1.6599453714684055e-06, "loss": 0.0028, "num_tokens": 2548477053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8329777246735512, "frac_reward_zero_std": 1.0, "grad_norm": 6.223288517550778e-10, "kl": 0.06787109375, "learning_rate": 1.6566595520843253e-06, "loss": 0.0027, "num_tokens": 2549041517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8331484168302467, "frac_reward_zero_std": 1.0, "grad_norm": 6.856342717202693e-10, "kl": 0.0723876953125, "learning_rate": 1.6533766942849295e-06, "loss": 0.0029, "num_tokens": 2549608093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8333191089869421, "frac_reward_zero_std": 1.0, "grad_norm": 3.6055335192949803e-10, "kl": 0.07421875, "learning_rate": 1.6500967992355122e-06, "loss": 0.003, "num_tokens": 2550177293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8334898011436375, "frac_reward_zero_std": 1.0, "grad_norm": 6.731357007279121e-10, "kl": 0.06982421875, "learning_rate": 1.6468198681003134e-06, "loss": 0.0028, "num_tokens": 2550741597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8336604933003329, "frac_reward_zero_std": 1.0, "grad_norm": 6.510908716098272e-10, "kl": 0.0750732421875, "learning_rate": 1.6435459020425305e-06, "loss": 0.003, "num_tokens": 2551307949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8338311854570283, "frac_reward_zero_std": 1.0, "grad_norm": 3.300900736630351e-10, "kl": 0.0697021484375, "learning_rate": 1.6402749022243026e-06, "loss": 0.0028, "num_tokens": 2551878189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8340018776137237, "frac_reward_zero_std": 1.0, "grad_norm": 6.535790751657827e-10, "kl": 0.066650390625, "learning_rate": 1.6370068698067165e-06, "loss": 0.0027, "num_tokens": 2552459821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8341725697704191, "frac_reward_zero_std": 1.0, "grad_norm": 6.757471996290348e-10, "kl": 0.078125, "learning_rate": 1.6337418059498022e-06, "loss": 0.0031, "num_tokens": 2553021533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8343432619271145, "frac_reward_zero_std": 1.0, "grad_norm": 4.641322401773023e-10, "kl": 0.0684814453125, "learning_rate": 1.6304797118125414e-06, "loss": 0.0027, "num_tokens": 2553590381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8345139540838098, "frac_reward_zero_std": 1.0, "grad_norm": 6.206194208664819e-10, "kl": 0.06640625, "learning_rate": 1.6272205885528648e-06, "loss": 0.0027, "num_tokens": 2554155677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8346846462405052, "frac_reward_zero_std": 1.0, "grad_norm": 6.392576438041105e-10, "kl": 0.0716552734375, "learning_rate": 1.6239644373276375e-06, "loss": 0.0029, "num_tokens": 2554718637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8348553383972006, "frac_reward_zero_std": 1.0, "grad_norm": 7.407618301556285e-10, "kl": 0.0731201171875, "learning_rate": 1.6207112592926766e-06, "loss": 0.0029, "num_tokens": 2555280429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.835026030553896, "frac_reward_zero_std": 1.0, "grad_norm": 6.278262669653697e-10, "kl": 0.0726318359375, "learning_rate": 1.6174610556027426e-06, "loss": 0.0029, "num_tokens": 2555845981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8351967227105914, "frac_reward_zero_std": 1.0, "grad_norm": 4.684742514051867e-10, "kl": 0.0687255859375, "learning_rate": 1.6142138274115437e-06, "loss": 0.0028, "num_tokens": 2556414333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8353674148672868, "frac_reward_zero_std": 1.0, "grad_norm": 5.54487645642926e-10, "kl": 0.0745849609375, "learning_rate": 1.6109695758717248e-06, "loss": 0.003, "num_tokens": 2556975581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8355381070239822, "frac_reward_zero_std": 1.0, "grad_norm": 7.375089870681545e-10, "kl": 0.070068359375, "learning_rate": 1.6077283021348822e-06, "loss": 0.0028, "num_tokens": 2557536573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8357087991806776, "frac_reward_zero_std": 1.0, "grad_norm": 7.480413839341364e-10, "kl": 0.072509765625, "learning_rate": 1.6044900073515467e-06, "loss": 0.0029, "num_tokens": 2558100077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.835879491337373, "frac_reward_zero_std": 1.0, "grad_norm": 4.897644219545374e-10, "kl": 0.071533203125, "learning_rate": 1.6012546926712003e-06, "loss": 0.0029, "num_tokens": 2558674205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8360501834940685, "frac_reward_zero_std": 1.0, "grad_norm": 5.421070708361441e-10, "kl": 0.0750732421875, "learning_rate": 1.5980223592422583e-06, "loss": 0.003, "num_tokens": 2559237309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8362208756507639, "frac_reward_zero_std": 1.0, "grad_norm": 5.386072835458085e-10, "kl": 0.0687255859375, "learning_rate": 1.5947930082120867e-06, "loss": 0.0028, "num_tokens": 2559804493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8363915678074593, "frac_reward_zero_std": 1.0, "grad_norm": 4.958198352493439e-10, "kl": 0.072265625, "learning_rate": 1.5915666407269847e-06, "loss": 0.0029, "num_tokens": 2560367949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8365622599641547, "frac_reward_zero_std": 1.0, "grad_norm": 6.978911056893481e-10, "kl": 0.068603515625, "learning_rate": 1.5883432579322e-06, "loss": 0.0027, "num_tokens": 2560932925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8367329521208501, "frac_reward_zero_std": 1.0, "grad_norm": 7.565776008380169e-10, "kl": 0.07177734375, "learning_rate": 1.5851228609719138e-06, "loss": 0.0029, "num_tokens": 2561493325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8369036442775455, "frac_reward_zero_std": 1.0, "grad_norm": 5.378523779178892e-10, "kl": 0.071533203125, "learning_rate": 1.581905450989254e-06, "loss": 0.0029, "num_tokens": 2562061437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8370743364342409, "frac_reward_zero_std": 1.0, "grad_norm": 3.854879246329676e-10, "kl": 0.068359375, "learning_rate": 1.5786910291262814e-06, "loss": 0.0027, "num_tokens": 2562631309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8372450285909362, "frac_reward_zero_std": 1.0, "grad_norm": 4.415701667668642e-10, "kl": 0.0709228515625, "learning_rate": 1.5754795965240043e-06, "loss": 0.0028, "num_tokens": 2563199389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8374157207476316, "frac_reward_zero_std": 1.0, "grad_norm": 5.44097977703069e-10, "kl": 0.070068359375, "learning_rate": 1.5722711543223601e-06, "loss": 0.0028, "num_tokens": 2563762333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.837586412904327, "frac_reward_zero_std": 1.0, "grad_norm": 5.870313099570923e-10, "kl": 0.0709228515625, "learning_rate": 1.5690657036602352e-06, "loss": 0.0028, "num_tokens": 2564325389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8377571050610224, "frac_reward_zero_std": 1.0, "grad_norm": 5.694427460440372e-10, "kl": 0.0889892578125, "learning_rate": 1.565863245675443e-06, "loss": 0.0036, "num_tokens": 2564890717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8379277972177178, "frac_reward_zero_std": 1.0, "grad_norm": 5.219459108516504e-10, "kl": 0.0716552734375, "learning_rate": 1.5626637815047475e-06, "loss": 0.0029, "num_tokens": 2565458781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8380984893744132, "frac_reward_zero_std": 1.0, "grad_norm": 5.117741620427004e-10, "kl": 0.0706787109375, "learning_rate": 1.5594673122838355e-06, "loss": 0.0028, "num_tokens": 2566026589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8382691815311086, "frac_reward_zero_std": 1.0, "grad_norm": 5.427624733777666e-10, "kl": 0.06787109375, "learning_rate": 1.556273839147342e-06, "loss": 0.0027, "num_tokens": 2566592445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.838439873687804, "frac_reward_zero_std": 1.0, "grad_norm": 6.568472875594317e-10, "kl": 0.0714111328125, "learning_rate": 1.5530833632288377e-06, "loss": 0.0029, "num_tokens": 2567155933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8386105658444994, "frac_reward_zero_std": 1.0, "grad_norm": 6.275531257902823e-10, "kl": 0.0723876953125, "learning_rate": 1.5498958856608227e-06, "loss": 0.0029, "num_tokens": 2567718845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8387812580011949, "frac_reward_zero_std": 1.0, "grad_norm": 7.155288011507762e-10, "kl": 0.076171875, "learning_rate": 1.5467114075747347e-06, "loss": 0.003, "num_tokens": 2568279741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8389519501578903, "frac_reward_zero_std": 1.0, "grad_norm": 5.291957137536145e-10, "kl": 0.06787109375, "learning_rate": 1.543529930100952e-06, "loss": 0.0027, "num_tokens": 2568844669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8391226423145857, "frac_reward_zero_std": 1.0, "grad_norm": 8.002012064957087e-10, "kl": 0.071044921875, "learning_rate": 1.5403514543687837e-06, "loss": 0.0028, "num_tokens": 2569411725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8392933344712811, "frac_reward_zero_std": 1.0, "grad_norm": 6.292122160541666e-10, "kl": 0.0733642578125, "learning_rate": 1.5371759815064745e-06, "loss": 0.0029, "num_tokens": 2569973373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8394640266279765, "frac_reward_zero_std": 1.0, "grad_norm": 3.163800548195034e-10, "kl": 0.068359375, "learning_rate": 1.534003512641199e-06, "loss": 0.0027, "num_tokens": 2570544109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8396347187846719, "frac_reward_zero_std": 1.0, "grad_norm": 5.199302636841474e-10, "kl": 0.072998046875, "learning_rate": 1.5308340488990713e-06, "loss": 0.0029, "num_tokens": 2571108029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8398054109413673, "frac_reward_zero_std": 1.0, "grad_norm": 7.138136050390999e-10, "kl": 0.0697021484375, "learning_rate": 1.5276675914051387e-06, "loss": 0.0028, "num_tokens": 2571669773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8399761030980627, "frac_reward_zero_std": 1.0, "grad_norm": 5.773320157749935e-10, "kl": 0.071533203125, "learning_rate": 1.5245041412833783e-06, "loss": 0.0029, "num_tokens": 2572231373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.840146795254758, "frac_reward_zero_std": 1.0, "grad_norm": 6.911401530658423e-10, "kl": 0.07177734375, "learning_rate": 1.5213436996566976e-06, "loss": 0.0029, "num_tokens": 2572792333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8403174874114534, "frac_reward_zero_std": 1.0, "grad_norm": 6.133756389370108e-10, "kl": 0.070068359375, "learning_rate": 1.5181862676469404e-06, "loss": 0.0028, "num_tokens": 2573356845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8404881795681488, "frac_reward_zero_std": 1.0, "grad_norm": 5.681006510647624e-10, "kl": 0.0716552734375, "learning_rate": 1.5150318463748858e-06, "loss": 0.0029, "num_tokens": 2573920589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8406588717248442, "frac_reward_zero_std": 1.0, "grad_norm": 3.9713511327105933e-10, "kl": 0.0731201171875, "learning_rate": 1.5118804369602358e-06, "loss": 0.0029, "num_tokens": 2574492477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8408295638815396, "frac_reward_zero_std": 1.0, "grad_norm": 6.286958152417205e-10, "kl": 0.0694580078125, "learning_rate": 1.5087320405216256e-06, "loss": 0.0028, "num_tokens": 2575055885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.841000256038235, "frac_reward_zero_std": 1.0, "grad_norm": 7.555919632132786e-10, "kl": 0.0704345703125, "learning_rate": 1.5055866581766232e-06, "loss": 0.0028, "num_tokens": 2575620893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8411709481949304, "frac_reward_zero_std": 1.0, "grad_norm": 5.994588505319477e-10, "kl": 0.07666015625, "learning_rate": 1.5024442910417257e-06, "loss": 0.0031, "num_tokens": 2576185357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8413416403516258, "frac_reward_zero_std": 1.0, "grad_norm": 5.129505786095779e-10, "kl": 0.0687255859375, "learning_rate": 1.4993049402323656e-06, "loss": 0.0028, "num_tokens": 2576760173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8415123325083212, "frac_reward_zero_std": 1.0, "grad_norm": 4.1745201014635503e-10, "kl": 0.0775146484375, "learning_rate": 1.49616860686289e-06, "loss": 0.0031, "num_tokens": 2577328845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8416830246650167, "frac_reward_zero_std": 1.0, "grad_norm": 4.904995610370952e-10, "kl": 0.0703125, "learning_rate": 1.4930352920465886e-06, "loss": 0.0028, "num_tokens": 2577895117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8418537168217121, "frac_reward_zero_std": 1.0, "grad_norm": 4.2666717468818384e-10, "kl": 0.0792236328125, "learning_rate": 1.4899049968956747e-06, "loss": 0.0032, "num_tokens": 2578462509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8420244089784075, "frac_reward_zero_std": 1.0, "grad_norm": 6.190190050554246e-10, "kl": 0.0701904296875, "learning_rate": 1.4867777225212943e-06, "loss": 0.0028, "num_tokens": 2579027469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8421951011351029, "frac_reward_zero_std": 1.0, "grad_norm": 6.044459949594564e-10, "kl": 0.0677490234375, "learning_rate": 1.483653470033508e-06, "loss": 0.0027, "num_tokens": 2579591581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8423657932917983, "frac_reward_zero_std": 1.0, "grad_norm": 6.343332823229295e-10, "kl": 0.0711669921875, "learning_rate": 1.4805322405413181e-06, "loss": 0.0028, "num_tokens": 2580162381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8425364854484937, "frac_reward_zero_std": 1.0, "grad_norm": 4.4172210742177056e-10, "kl": 0.068359375, "learning_rate": 1.4774140351526468e-06, "loss": 0.0027, "num_tokens": 2580729261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8427071776051891, "frac_reward_zero_std": 1.0, "grad_norm": 7.218666612397803e-10, "kl": 0.0709228515625, "learning_rate": 1.4742988549743476e-06, "loss": 0.0028, "num_tokens": 2581292525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8428778697618844, "frac_reward_zero_std": 1.0, "grad_norm": 6.318716637177855e-10, "kl": 0.0716552734375, "learning_rate": 1.471186701112195e-06, "loss": 0.0029, "num_tokens": 2581860973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8430485619185798, "frac_reward_zero_std": 1.0, "grad_norm": 7.222835804941174e-10, "kl": 0.076171875, "learning_rate": 1.4680775746708898e-06, "loss": 0.003, "num_tokens": 2582422157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8432192540752752, "frac_reward_zero_std": 1.0, "grad_norm": 7.281418471699214e-10, "kl": 0.0726318359375, "learning_rate": 1.464971476754059e-06, "loss": 0.0029, "num_tokens": 2582985117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8433899462319706, "frac_reward_zero_std": 1.0, "grad_norm": 3.8782681246654226e-10, "kl": 0.06982421875, "learning_rate": 1.461868408464261e-06, "loss": 0.0028, "num_tokens": 2583551245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.843560638388666, "frac_reward_zero_std": 1.0, "grad_norm": 5.512001920910886e-10, "kl": 0.0692138671875, "learning_rate": 1.4587683709029688e-06, "loss": 0.0028, "num_tokens": 2584114109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8437313305453614, "frac_reward_zero_std": 1.0, "grad_norm": 1.0215538027784674e-09, "kl": 0.076416015625, "learning_rate": 1.455671365170581e-06, "loss": 0.0031, "num_tokens": 2584676189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8439020227020568, "frac_reward_zero_std": 1.0, "grad_norm": 5.402268910070305e-10, "kl": 0.0701904296875, "learning_rate": 1.4525773923664267e-06, "loss": 0.0028, "num_tokens": 2585243053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8440727148587522, "frac_reward_zero_std": 1.0, "grad_norm": 6.087018178929825e-10, "kl": 0.0718994140625, "learning_rate": 1.4494864535887564e-06, "loss": 0.0029, "num_tokens": 2585805885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8442434070154476, "frac_reward_zero_std": 1.0, "grad_norm": 6.012354520102114e-10, "kl": 0.0716552734375, "learning_rate": 1.446398549934739e-06, "loss": 0.0029, "num_tokens": 2586372957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.844414099172143, "frac_reward_zero_std": 1.0, "grad_norm": 6.600344980699904e-10, "kl": 0.07421875, "learning_rate": 1.4433136825004668e-06, "loss": 0.003, "num_tokens": 2586934701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8445847913288385, "frac_reward_zero_std": 1.0, "grad_norm": 5.829088559450754e-10, "kl": 0.069091796875, "learning_rate": 1.440231852380959e-06, "loss": 0.0028, "num_tokens": 2587499805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8447554834855339, "frac_reward_zero_std": 1.0, "grad_norm": 5.212438333634799e-10, "kl": 0.0670166015625, "learning_rate": 1.4371530606701544e-06, "loss": 0.0027, "num_tokens": 2588065389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8449261756422293, "frac_reward_zero_std": 1.0, "grad_norm": 4.899656327636004e-10, "kl": 0.06884765625, "learning_rate": 1.4340773084609138e-06, "loss": 0.0028, "num_tokens": 2588641629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8450968677989247, "frac_reward_zero_std": 1.0, "grad_norm": 4.803596789255346e-10, "kl": 0.0704345703125, "learning_rate": 1.4310045968450148e-06, "loss": 0.0028, "num_tokens": 2589216637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8452675599556201, "frac_reward_zero_std": 1.0, "grad_norm": 5.44593782301734e-10, "kl": 0.0687255859375, "learning_rate": 1.4279349269131593e-06, "loss": 0.0027, "num_tokens": 2589781917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8454382521123155, "frac_reward_zero_std": 1.0, "grad_norm": 6.239385451550517e-10, "kl": 0.06884765625, "learning_rate": 1.4248682997549744e-06, "loss": 0.0028, "num_tokens": 2590346365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8456089442690108, "frac_reward_zero_std": 1.0, "grad_norm": 7.502144319702405e-10, "kl": 0.0732421875, "learning_rate": 1.4218047164589966e-06, "loss": 0.0029, "num_tokens": 2590913581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8457796364257062, "frac_reward_zero_std": 1.0, "grad_norm": 4.623992229321361e-10, "kl": 0.0703125, "learning_rate": 1.418744178112691e-06, "loss": 0.0028, "num_tokens": 2591482925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8459503285824016, "frac_reward_zero_std": 1.0, "grad_norm": 6.56397842898951e-10, "kl": 0.0703125, "learning_rate": 1.4156866858024343e-06, "loss": 0.0028, "num_tokens": 2592046669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.846121020739097, "frac_reward_zero_std": 1.0, "grad_norm": 7.755255490843383e-10, "kl": 0.074462890625, "learning_rate": 1.4126322406135306e-06, "loss": 0.003, "num_tokens": 2592605197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8462917128957924, "frac_reward_zero_std": 1.0, "grad_norm": 6.333319630200572e-10, "kl": 0.069091796875, "learning_rate": 1.4095808436301928e-06, "loss": 0.0028, "num_tokens": 2593167389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8464624050524878, "frac_reward_zero_std": 1.0, "grad_norm": 6.564953889296108e-10, "kl": 0.070068359375, "learning_rate": 1.406532495935562e-06, "loss": 0.0028, "num_tokens": 2593728685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8466330972091832, "frac_reward_zero_std": 1.0, "grad_norm": 4.302372788381445e-10, "kl": 0.0682373046875, "learning_rate": 1.403487198611686e-06, "loss": 0.0027, "num_tokens": 2594297165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8468037893658786, "frac_reward_zero_std": 1.0, "grad_norm": 5.77680675026147e-10, "kl": 0.0751953125, "learning_rate": 1.4004449527395415e-06, "loss": 0.003, "num_tokens": 2594865565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.846974481522574, "frac_reward_zero_std": 1.0, "grad_norm": 6.220381237505264e-10, "kl": 0.071044921875, "learning_rate": 1.3974057593990086e-06, "loss": 0.0028, "num_tokens": 2595428941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8471451736792694, "frac_reward_zero_std": 1.0, "grad_norm": 4.3430800816669014e-10, "kl": 0.0682373046875, "learning_rate": 1.3943696196688993e-06, "loss": 0.0027, "num_tokens": 2595997101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8473158658359649, "frac_reward_zero_std": 1.0, "grad_norm": 6.675230512400708e-10, "kl": 0.0743408203125, "learning_rate": 1.3913365346269269e-06, "loss": 0.003, "num_tokens": 2596562301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8474865579926603, "frac_reward_zero_std": 1.0, "grad_norm": 6.10350864056292e-10, "kl": 0.0694580078125, "learning_rate": 1.3883065053497313e-06, "loss": 0.0028, "num_tokens": 2597127869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8476572501493557, "frac_reward_zero_std": 1.0, "grad_norm": 2.8412014590028546e-10, "kl": 0.0703125, "learning_rate": 1.385279532912861e-06, "loss": 0.0028, "num_tokens": 2597699453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8478279423060511, "frac_reward_zero_std": 1.0, "grad_norm": 6.320576080171984e-10, "kl": 0.0721435546875, "learning_rate": 1.3822556183907842e-06, "loss": 0.0029, "num_tokens": 2598268253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8479986344627465, "frac_reward_zero_std": 1.0, "grad_norm": 6.476843581337393e-10, "kl": 0.0736083984375, "learning_rate": 1.379234762856878e-06, "loss": 0.0029, "num_tokens": 2598833053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8481693266194419, "frac_reward_zero_std": 1.0, "grad_norm": 5.944605896817786e-10, "kl": 0.0704345703125, "learning_rate": 1.3762169673834413e-06, "loss": 0.0028, "num_tokens": 2599398269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8483400187761372, "frac_reward_zero_std": 1.0, "grad_norm": 3.592993312173022e-10, "kl": 0.070556640625, "learning_rate": 1.3732022330416795e-06, "loss": 0.0028, "num_tokens": 2599965197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8485107109328326, "frac_reward_zero_std": 1.0, "grad_norm": 6.630266538220678e-10, "kl": 0.0733642578125, "learning_rate": 1.370190560901714e-06, "loss": 0.0029, "num_tokens": 2600524141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.848681403089528, "frac_reward_zero_std": 1.0, "grad_norm": 4.688503634715311e-10, "kl": 0.0693359375, "learning_rate": 1.3671819520325847e-06, "loss": 0.0028, "num_tokens": 2601090621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8488520952462234, "frac_reward_zero_std": 1.0, "grad_norm": 4.6178704647529144e-10, "kl": 0.0694580078125, "learning_rate": 1.3641764075022345e-06, "loss": 0.0028, "num_tokens": 2601666045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8490227874029188, "frac_reward_zero_std": 1.0, "grad_norm": 4.4199479930926704e-10, "kl": 0.070068359375, "learning_rate": 1.3611739283775227e-06, "loss": 0.0028, "num_tokens": 2602230445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8491934795596142, "frac_reward_zero_std": 1.0, "grad_norm": 6.54659860356835e-10, "kl": 0.0703125, "learning_rate": 1.358174515724222e-06, "loss": 0.0028, "num_tokens": 2602790701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8493641717163096, "frac_reward_zero_std": 1.0, "grad_norm": 5.771104996796209e-10, "kl": 0.0714111328125, "learning_rate": 1.355178170607019e-06, "loss": 0.0029, "num_tokens": 2603355773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.849534863873005, "frac_reward_zero_std": 1.0, "grad_norm": 5.95918800543555e-10, "kl": 0.0721435546875, "learning_rate": 1.3521848940895043e-06, "loss": 0.0029, "num_tokens": 2603918445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8497055560297004, "frac_reward_zero_std": 1.0, "grad_norm": 4.514285032645367e-10, "kl": 0.06884765625, "learning_rate": 1.3491946872341833e-06, "loss": 0.0028, "num_tokens": 2604487741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8498762481863958, "frac_reward_zero_std": 1.0, "grad_norm": 5.42868592944196e-10, "kl": 0.07373046875, "learning_rate": 1.3462075511024708e-06, "loss": 0.0029, "num_tokens": 2605052029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8500469403430913, "frac_reward_zero_std": 1.0, "grad_norm": 5.931792650686993e-10, "kl": 0.0711669921875, "learning_rate": 1.3432234867546968e-06, "loss": 0.0028, "num_tokens": 2605613245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8502176324997867, "frac_reward_zero_std": 1.0, "grad_norm": 6.108231254310791e-10, "kl": 0.0709228515625, "learning_rate": 1.3402424952500937e-06, "loss": 0.0028, "num_tokens": 2606185245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8503883246564821, "frac_reward_zero_std": 1.0, "grad_norm": 6.314246278766875e-10, "kl": 0.075439453125, "learning_rate": 1.3372645776468018e-06, "loss": 0.003, "num_tokens": 2606749245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8505590168131775, "frac_reward_zero_std": 1.0, "grad_norm": 3.619512041329712e-10, "kl": 0.071044921875, "learning_rate": 1.3342897350018792e-06, "loss": 0.0028, "num_tokens": 2607319965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8507297089698729, "frac_reward_zero_std": 1.0, "grad_norm": 5.494920194420723e-10, "kl": 0.0751953125, "learning_rate": 1.3313179683712884e-06, "loss": 0.003, "num_tokens": 2607887901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8509004011265683, "frac_reward_zero_std": 1.0, "grad_norm": 5.470730761804988e-10, "kl": 0.0703125, "learning_rate": 1.3283492788098972e-06, "loss": 0.0028, "num_tokens": 2608454749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8510710932832636, "frac_reward_zero_std": 1.0, "grad_norm": 5.051580055886484e-10, "kl": 0.0704345703125, "learning_rate": 1.3253836673714815e-06, "loss": 0.0028, "num_tokens": 2609016477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.851241785439959, "frac_reward_zero_std": 1.0, "grad_norm": 5.395167708215249e-10, "kl": 0.0728759765625, "learning_rate": 1.3224211351087291e-06, "loss": 0.0029, "num_tokens": 2609581293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8514124775966544, "frac_reward_zero_std": 1.0, "grad_norm": 6.035475141314282e-10, "kl": 0.077880859375, "learning_rate": 1.319461683073232e-06, "loss": 0.0031, "num_tokens": 2610147373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8515831697533498, "frac_reward_zero_std": 1.0, "grad_norm": 6.460885498427749e-10, "kl": 0.074462890625, "learning_rate": 1.316505312315488e-06, "loss": 0.003, "num_tokens": 2610711453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8517538619100452, "frac_reward_zero_std": 1.0, "grad_norm": 5.123091541768944e-10, "kl": 0.0712890625, "learning_rate": 1.3135520238848997e-06, "loss": 0.0029, "num_tokens": 2611273645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8519245540667406, "frac_reward_zero_std": 1.0, "grad_norm": 5.362740343471667e-10, "kl": 0.070556640625, "learning_rate": 1.3106018188297808e-06, "loss": 0.0028, "num_tokens": 2611840989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.852095246223436, "frac_reward_zero_std": 1.0, "grad_norm": 4.6119443720759667e-10, "kl": 0.07177734375, "learning_rate": 1.307654698197347e-06, "loss": 0.0029, "num_tokens": 2612410685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8522659383801314, "frac_reward_zero_std": 1.0, "grad_norm": 4.523454449751339e-10, "kl": 0.0701904296875, "learning_rate": 1.3047106630337236e-06, "loss": 0.0028, "num_tokens": 2612977437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8524366305368268, "frac_reward_zero_std": 1.0, "grad_norm": 6.965002710868528e-10, "kl": 0.0684814453125, "learning_rate": 1.3017697143839291e-06, "loss": 0.0027, "num_tokens": 2613541309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8526073226935222, "frac_reward_zero_std": 1.0, "grad_norm": 4.315113470123669e-10, "kl": 0.0689697265625, "learning_rate": 1.2988318532918975e-06, "loss": 0.0028, "num_tokens": 2614117069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8527780148502176, "frac_reward_zero_std": 1.0, "grad_norm": 6.278003932203205e-10, "kl": 0.0780029296875, "learning_rate": 1.2958970808004634e-06, "loss": 0.0031, "num_tokens": 2614685069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.852948707006913, "frac_reward_zero_std": 1.0, "grad_norm": 5.26065279324001e-10, "kl": 0.069091796875, "learning_rate": 1.292965397951369e-06, "loss": 0.0028, "num_tokens": 2615251709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8531193991636085, "frac_reward_zero_std": 1.0, "grad_norm": 3.698118399163228e-10, "kl": 0.0693359375, "learning_rate": 1.2900368057852507e-06, "loss": 0.0028, "num_tokens": 2615821357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8532900913203039, "frac_reward_zero_std": 1.0, "grad_norm": 4.574015348764e-10, "kl": 0.071533203125, "learning_rate": 1.2871113053416539e-06, "loss": 0.0029, "num_tokens": 2616391485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 4999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8534607834769993, "frac_reward_zero_std": 1.0, "grad_norm": 4.609865516555687e-10, "kl": 0.0738525390625, "learning_rate": 1.2841888976590255e-06, "loss": 0.003, "num_tokens": 2616962861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8536314756336947, "frac_reward_zero_std": 1.0, "grad_norm": 4.345818879993065e-10, "kl": 0.07470703125, "learning_rate": 1.2812695837747192e-06, "loss": 0.003, "num_tokens": 2617529885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.85380216779039, "frac_reward_zero_std": 1.0, "grad_norm": 6.076111891866278e-10, "kl": 0.071533203125, "learning_rate": 1.2783533647249814e-06, "loss": 0.0029, "num_tokens": 2618093773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8539728599470854, "frac_reward_zero_std": 1.0, "grad_norm": 5.089507931254244e-10, "kl": 0.0704345703125, "learning_rate": 1.2754402415449641e-06, "loss": 0.0028, "num_tokens": 2618662141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8541435521037808, "frac_reward_zero_std": 1.0, "grad_norm": 5.39655740075103e-10, "kl": 0.0711669921875, "learning_rate": 1.2725302152687224e-06, "loss": 0.0028, "num_tokens": 2619227613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8543142442604762, "frac_reward_zero_std": 1.0, "grad_norm": 6.774508725203528e-10, "kl": 0.0716552734375, "learning_rate": 1.2696232869292136e-06, "loss": 0.0029, "num_tokens": 2619789789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8544849364171716, "frac_reward_zero_std": 1.0, "grad_norm": 7.325032903158267e-10, "kl": 0.072509765625, "learning_rate": 1.2667194575582886e-06, "loss": 0.0029, "num_tokens": 2620353981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.854655628573867, "frac_reward_zero_std": 1.0, "grad_norm": 5.60555086397175e-10, "kl": 0.069580078125, "learning_rate": 1.263818728186701e-06, "loss": 0.0028, "num_tokens": 2620916493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8548263207305624, "frac_reward_zero_std": 1.0, "grad_norm": 5.402212516590643e-10, "kl": 0.0677490234375, "learning_rate": 1.2609210998441067e-06, "loss": 0.0027, "num_tokens": 2621477533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8549970128872578, "frac_reward_zero_std": 1.0, "grad_norm": 4.805256958181511e-10, "kl": 0.0716552734375, "learning_rate": 1.2580265735590612e-06, "loss": 0.0029, "num_tokens": 2622042333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8551677050439532, "frac_reward_zero_std": 1.0, "grad_norm": 7.568167007314926e-10, "kl": 0.073974609375, "learning_rate": 1.2551351503590148e-06, "loss": 0.003, "num_tokens": 2622601597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8553383972006486, "frac_reward_zero_std": 1.0, "grad_norm": 4.41854203850402e-10, "kl": 0.0745849609375, "learning_rate": 1.2522468312703172e-06, "loss": 0.003, "num_tokens": 2623169181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.855509089357344, "frac_reward_zero_std": 1.0, "grad_norm": 5.57751527168782e-10, "kl": 0.0682373046875, "learning_rate": 1.249361617318219e-06, "loss": 0.0027, "num_tokens": 2623733421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8556797815140395, "frac_reward_zero_std": 1.0, "grad_norm": 4.5075716824332973e-10, "kl": 0.07177734375, "learning_rate": 1.2464795095268678e-06, "loss": 0.0029, "num_tokens": 2624301869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8558504736707349, "frac_reward_zero_std": 1.0, "grad_norm": 5.214753742972538e-10, "kl": 0.0701904296875, "learning_rate": 1.2436005089193047e-06, "loss": 0.0028, "num_tokens": 2624866749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8560211658274303, "frac_reward_zero_std": 1.0, "grad_norm": 6.953018879363493e-10, "kl": 0.0723876953125, "learning_rate": 1.2407246165174757e-06, "loss": 0.0029, "num_tokens": 2625431405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8561918579841257, "frac_reward_zero_std": 1.0, "grad_norm": 5.481006421965947e-10, "kl": 0.0791015625, "learning_rate": 1.2378518333422141e-06, "loss": 0.0032, "num_tokens": 2625998829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8563625501408211, "frac_reward_zero_std": 1.0, "grad_norm": 4.006022805549843e-10, "kl": 0.070068359375, "learning_rate": 1.2349821604132584e-06, "loss": 0.0028, "num_tokens": 2626566109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8565332422975165, "frac_reward_zero_std": 1.0, "grad_norm": 4.544153326327199e-10, "kl": 0.06787109375, "learning_rate": 1.232115598749234e-06, "loss": 0.0027, "num_tokens": 2627130429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8567039344542118, "frac_reward_zero_std": 1.0, "grad_norm": 5.348391809544424e-10, "kl": 0.0714111328125, "learning_rate": 1.2292521493676724e-06, "loss": 0.0029, "num_tokens": 2627703677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8568746266109072, "frac_reward_zero_std": 1.0, "grad_norm": 6.061306156041197e-10, "kl": 0.0697021484375, "learning_rate": 1.2263918132849894e-06, "loss": 0.0028, "num_tokens": 2628268653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8570453187676026, "frac_reward_zero_std": 1.0, "grad_norm": 5.520970965301852e-10, "kl": 0.06689453125, "learning_rate": 1.2235345915165065e-06, "loss": 0.0027, "num_tokens": 2628838557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.857216010924298, "frac_reward_zero_std": 1.0, "grad_norm": 3.9393463796506336e-10, "kl": 0.0733642578125, "learning_rate": 1.2206804850764287e-06, "loss": 0.0029, "num_tokens": 2629411501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8573867030809934, "frac_reward_zero_std": 1.0, "grad_norm": 7.330954488953625e-10, "kl": 0.0714111328125, "learning_rate": 1.2178294949778657e-06, "loss": 0.0029, "num_tokens": 2629975037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8575573952376888, "frac_reward_zero_std": 1.0, "grad_norm": 5.641140626090388e-10, "kl": 0.0701904296875, "learning_rate": 1.2149816222328115e-06, "loss": 0.0028, "num_tokens": 2630538925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8577280873943842, "frac_reward_zero_std": 1.0, "grad_norm": 4.848085310248514e-10, "kl": 0.0709228515625, "learning_rate": 1.2121368678521628e-06, "loss": 0.0028, "num_tokens": 2631109885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8578987795510796, "frac_reward_zero_std": 1.0, "grad_norm": 4.883696340303449e-10, "kl": 0.0738525390625, "learning_rate": 1.2092952328456998e-06, "loss": 0.003, "num_tokens": 2631673805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.858069471707775, "frac_reward_zero_std": 1.0, "grad_norm": 5.393540527704487e-10, "kl": 0.0684814453125, "learning_rate": 1.2064567182221055e-06, "loss": 0.0027, "num_tokens": 2632239997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8582401638644704, "frac_reward_zero_std": 1.0, "grad_norm": 5.171237201524848e-10, "kl": 0.0684814453125, "learning_rate": 1.2036213249889462e-06, "loss": 0.0027, "num_tokens": 2632809341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8584108560211658, "frac_reward_zero_std": 1.0, "grad_norm": 6.503237321118665e-10, "kl": 0.0693359375, "learning_rate": 1.2007890541526867e-06, "loss": 0.0028, "num_tokens": 2633374685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8585815481778613, "frac_reward_zero_std": 1.0, "grad_norm": 5.651784133934118e-10, "kl": 0.0740966796875, "learning_rate": 1.1979599067186776e-06, "loss": 0.003, "num_tokens": 2633939741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8587522403345567, "frac_reward_zero_std": 1.0, "grad_norm": 3.742028892297098e-10, "kl": 0.0711669921875, "learning_rate": 1.1951338836911674e-06, "loss": 0.0028, "num_tokens": 2634507757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8589229324912521, "frac_reward_zero_std": 1.0, "grad_norm": 5.084199078059144e-10, "kl": 0.0718994140625, "learning_rate": 1.1923109860732906e-06, "loss": 0.0029, "num_tokens": 2635072557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8590936246479475, "frac_reward_zero_std": 1.0, "grad_norm": 4.3314942058646e-10, "kl": 0.0709228515625, "learning_rate": 1.1894912148670755e-06, "loss": 0.0028, "num_tokens": 2635640509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8592643168046429, "frac_reward_zero_std": 1.0, "grad_norm": 6.006495307515119e-10, "kl": 0.068115234375, "learning_rate": 1.1866745710734363e-06, "loss": 0.0027, "num_tokens": 2636202189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8594350089613382, "frac_reward_zero_std": 1.0, "grad_norm": 6.521575881784587e-10, "kl": 0.0755615234375, "learning_rate": 1.1838610556921814e-06, "loss": 0.003, "num_tokens": 2636768029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8596057011180336, "frac_reward_zero_std": 1.0, "grad_norm": 4.719453865563466e-10, "kl": 0.0711669921875, "learning_rate": 1.1810506697220104e-06, "loss": 0.0029, "num_tokens": 2637336701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.859776393274729, "frac_reward_zero_std": 1.0, "grad_norm": 6.094482087175133e-10, "kl": 0.068115234375, "learning_rate": 1.1782434141605048e-06, "loss": 0.0027, "num_tokens": 2637905101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8599470854314244, "frac_reward_zero_std": 1.0, "grad_norm": 6.433433195010932e-10, "kl": 0.0712890625, "learning_rate": 1.1754392900041388e-06, "loss": 0.0029, "num_tokens": 2638474989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8601177775881198, "frac_reward_zero_std": 1.0, "grad_norm": 3.971083432846284e-10, "kl": 0.07080078125, "learning_rate": 1.1726382982482753e-06, "loss": 0.0028, "num_tokens": 2639045245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8602884697448152, "frac_reward_zero_std": 1.0, "grad_norm": 7.766607354537079e-10, "kl": 0.0733642578125, "learning_rate": 1.1698404398871699e-06, "loss": 0.0029, "num_tokens": 2639607821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8604591619015106, "frac_reward_zero_std": 1.0, "grad_norm": 4.213911880303913e-10, "kl": 0.0703125, "learning_rate": 1.167045715913957e-06, "loss": 0.0028, "num_tokens": 2640177165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.860629854058206, "frac_reward_zero_std": 1.0, "grad_norm": 5.347431019800177e-10, "kl": 0.0670166015625, "learning_rate": 1.164254127320662e-06, "loss": 0.0027, "num_tokens": 2640742493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8608005462149014, "frac_reward_zero_std": 1.0, "grad_norm": 6.147354734062524e-10, "kl": 0.0712890625, "learning_rate": 1.161465675098199e-06, "loss": 0.0028, "num_tokens": 2641307341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8609712383715968, "frac_reward_zero_std": 1.0, "grad_norm": 6.30462016039991e-10, "kl": 0.072265625, "learning_rate": 1.1586803602363717e-06, "loss": 0.0029, "num_tokens": 2641891373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8611419305282922, "frac_reward_zero_std": 1.0, "grad_norm": 5.096318945374434e-10, "kl": 0.06689453125, "learning_rate": 1.1558981837238626e-06, "loss": 0.0027, "num_tokens": 2642458717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8613126226849876, "frac_reward_zero_std": 1.0, "grad_norm": 4.898557725779481e-10, "kl": 0.0697021484375, "learning_rate": 1.1531191465482417e-06, "loss": 0.0028, "num_tokens": 2643022125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8614833148416831, "frac_reward_zero_std": 1.0, "grad_norm": 5.526389159192087e-10, "kl": 0.069091796875, "learning_rate": 1.150343249695971e-06, "loss": 0.0028, "num_tokens": 2643584477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8616540069983785, "frac_reward_zero_std": 1.0, "grad_norm": 5.55443850505012e-10, "kl": 0.0693359375, "learning_rate": 1.1475704941523934e-06, "loss": 0.0028, "num_tokens": 2644156429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8618246991550739, "frac_reward_zero_std": 1.0, "grad_norm": 8.350220946892775e-10, "kl": 0.07275390625, "learning_rate": 1.1448008809017376e-06, "loss": 0.0029, "num_tokens": 2644717821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8619953913117693, "frac_reward_zero_std": 1.0, "grad_norm": 5.17487291758937e-10, "kl": 0.07080078125, "learning_rate": 1.142034410927111e-06, "loss": 0.0028, "num_tokens": 2645283789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8621660834684646, "frac_reward_zero_std": 1.0, "grad_norm": 3.6396656882322697e-10, "kl": 0.066650390625, "learning_rate": 1.1392710852105149e-06, "loss": 0.0027, "num_tokens": 2645850685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.86233677562516, "frac_reward_zero_std": 1.0, "grad_norm": 6.292949184433767e-10, "kl": 0.070556640625, "learning_rate": 1.1365109047328294e-06, "loss": 0.0028, "num_tokens": 2646421469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8625074677818554, "frac_reward_zero_std": 1.0, "grad_norm": 3.639473719473336e-10, "kl": 0.0684814453125, "learning_rate": 1.1337538704738239e-06, "loss": 0.0027, "num_tokens": 2646993293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8626781599385508, "frac_reward_zero_std": 1.0, "grad_norm": 4.647577191591855e-10, "kl": 0.075439453125, "learning_rate": 1.1309999834121365e-06, "loss": 0.003, "num_tokens": 2647556749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8628488520952462, "frac_reward_zero_std": 1.0, "grad_norm": 4.0513357136717477e-10, "kl": 0.0694580078125, "learning_rate": 1.1282492445253035e-06, "loss": 0.0028, "num_tokens": 2648128557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8630195442519416, "frac_reward_zero_std": 1.0, "grad_norm": 3.92827011716531e-10, "kl": 0.0667724609375, "learning_rate": 1.1255016547897358e-06, "loss": 0.0027, "num_tokens": 2648706669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.863190236408637, "frac_reward_zero_std": 1.0, "grad_norm": 5.91686153241732e-10, "kl": 0.06884765625, "learning_rate": 1.122757215180732e-06, "loss": 0.0028, "num_tokens": 2649270141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8633609285653324, "frac_reward_zero_std": 1.0, "grad_norm": 5.114262774688598e-10, "kl": 0.06982421875, "learning_rate": 1.1200159266724675e-06, "loss": 0.0028, "num_tokens": 2649837053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8635316207220278, "frac_reward_zero_std": 1.0, "grad_norm": 4.629717846872983e-10, "kl": 0.0672607421875, "learning_rate": 1.1172777902379972e-06, "loss": 0.0027, "num_tokens": 2650400973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8637023128787232, "frac_reward_zero_std": 1.0, "grad_norm": 8.134581535799509e-10, "kl": 0.0755615234375, "learning_rate": 1.1145428068492635e-06, "loss": 0.003, "num_tokens": 2650963085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8638730050354186, "frac_reward_zero_std": 1.0, "grad_norm": 3.5690653630951276e-10, "kl": 0.0655517578125, "learning_rate": 1.1118109774770892e-06, "loss": 0.0026, "num_tokens": 2651531421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.864043697192114, "frac_reward_zero_std": 1.0, "grad_norm": 6.49883951707771e-10, "kl": 0.0732421875, "learning_rate": 1.1090823030911725e-06, "loss": 0.0029, "num_tokens": 2652092621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8642143893488095, "frac_reward_zero_std": 1.0, "grad_norm": 4.2822283688017617e-10, "kl": 0.0665283203125, "learning_rate": 1.106356784660092e-06, "loss": 0.0027, "num_tokens": 2652660317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8643850815055049, "frac_reward_zero_std": 1.0, "grad_norm": 5.083290627624836e-10, "kl": 0.0692138671875, "learning_rate": 1.1036344231513107e-06, "loss": 0.0028, "num_tokens": 2653227821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8645557736622003, "frac_reward_zero_std": 1.0, "grad_norm": 5.860773327802224e-10, "kl": 0.071044921875, "learning_rate": 1.1009152195311711e-06, "loss": 0.0028, "num_tokens": 2653792445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8647264658188957, "frac_reward_zero_std": 1.0, "grad_norm": 6.93647885535233e-10, "kl": 0.0740966796875, "learning_rate": 1.0981991747648912e-06, "loss": 0.003, "num_tokens": 2654354269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.864897157975591, "frac_reward_zero_std": 1.0, "grad_norm": 4.687658316399386e-10, "kl": 0.06787109375, "learning_rate": 1.0954862898165642e-06, "loss": 0.0027, "num_tokens": 2654918813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8650678501322864, "frac_reward_zero_std": 1.0, "grad_norm": 4.2892174319498134e-10, "kl": 0.0675048828125, "learning_rate": 1.0927765656491707e-06, "loss": 0.0027, "num_tokens": 2655486845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8652385422889818, "frac_reward_zero_std": 1.0, "grad_norm": 4.4296876227648193e-10, "kl": 0.06689453125, "learning_rate": 1.0900700032245659e-06, "loss": 0.0027, "num_tokens": 2656055469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8654092344456772, "frac_reward_zero_std": 1.0, "grad_norm": 4.0611157201465243e-10, "kl": 0.0701904296875, "learning_rate": 1.0873666035034802e-06, "loss": 0.0028, "num_tokens": 2656627933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8655799266023726, "frac_reward_zero_std": 1.0, "grad_norm": 7.734289714576584e-10, "kl": 0.0772705078125, "learning_rate": 1.0846663674455205e-06, "loss": 0.0031, "num_tokens": 2657191053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.865750618759068, "frac_reward_zero_std": 1.0, "grad_norm": 5.71332342038938e-10, "kl": 0.0716552734375, "learning_rate": 1.0819692960091755e-06, "loss": 0.0029, "num_tokens": 2657759645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8659213109157634, "frac_reward_zero_std": 1.0, "grad_norm": 4.3294960063481073e-10, "kl": 0.0716552734375, "learning_rate": 1.0792753901518093e-06, "loss": 0.0029, "num_tokens": 2658328029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8660920030724588, "frac_reward_zero_std": 1.0, "grad_norm": 6.650007412717904e-10, "kl": 0.0750732421875, "learning_rate": 1.0765846508296584e-06, "loss": 0.003, "num_tokens": 2658893901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8662626952291542, "frac_reward_zero_std": 1.0, "grad_norm": 5.646115970573719e-10, "kl": 0.0670166015625, "learning_rate": 1.0738970789978408e-06, "loss": 0.0027, "num_tokens": 2659460381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8664333873858496, "frac_reward_zero_std": 1.0, "grad_norm": 4.696087396523942e-10, "kl": 0.071044921875, "learning_rate": 1.0712126756103458e-06, "loss": 0.0028, "num_tokens": 2660027469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.866604079542545, "frac_reward_zero_std": 1.0, "grad_norm": 6.426515809019902e-10, "kl": 0.0694580078125, "learning_rate": 1.0685314416200421e-06, "loss": 0.0028, "num_tokens": 2660591805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8667747716992404, "frac_reward_zero_std": 1.0, "grad_norm": 4.950465921433527e-10, "kl": 0.0693359375, "learning_rate": 1.065853377978666e-06, "loss": 0.0028, "num_tokens": 2661159885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8669454638559358, "frac_reward_zero_std": 1.0, "grad_norm": 6.433730692985286e-10, "kl": 0.0716552734375, "learning_rate": 1.0631784856368388e-06, "loss": 0.0029, "num_tokens": 2661722813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8671161560126313, "frac_reward_zero_std": 1.0, "grad_norm": 6.536223305824615e-10, "kl": 0.0667724609375, "learning_rate": 1.060506765544047e-06, "loss": 0.0027, "num_tokens": 2662289165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8672868481693267, "frac_reward_zero_std": 1.0, "grad_norm": 6.894274034840537e-10, "kl": 0.0726318359375, "learning_rate": 1.057838218648658e-06, "loss": 0.0029, "num_tokens": 2662854077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8674575403260221, "frac_reward_zero_std": 1.0, "grad_norm": 5.45537936211458e-10, "kl": 0.0670166015625, "learning_rate": 1.055172845897906e-06, "loss": 0.0027, "num_tokens": 2663425181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8676282324827174, "frac_reward_zero_std": 1.0, "grad_norm": 7.535957526516168e-10, "kl": 0.0697021484375, "learning_rate": 1.0525106482379065e-06, "loss": 0.0028, "num_tokens": 2663996781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8677989246394128, "frac_reward_zero_std": 1.0, "grad_norm": 4.6237770228837423e-10, "kl": 0.0692138671875, "learning_rate": 1.0498516266136383e-06, "loss": 0.0028, "num_tokens": 2664571341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8679696167961082, "frac_reward_zero_std": 1.0, "grad_norm": 3.395591820364915e-10, "kl": 0.0701904296875, "learning_rate": 1.0471957819689627e-06, "loss": 0.0028, "num_tokens": 2665144173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8681403089528036, "frac_reward_zero_std": 1.0, "grad_norm": 6.160230818611756e-10, "kl": 0.07763671875, "learning_rate": 1.0445431152466057e-06, "loss": 0.0031, "num_tokens": 2665713005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.868311001109499, "frac_reward_zero_std": 1.0, "grad_norm": 3.737049416517024e-10, "kl": 0.06640625, "learning_rate": 1.0418936273881708e-06, "loss": 0.0027, "num_tokens": 2666282893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8684816932661944, "frac_reward_zero_std": 1.0, "grad_norm": 4.285719307073235e-10, "kl": 0.075927734375, "learning_rate": 1.0392473193341267e-06, "loss": 0.003, "num_tokens": 2666849165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8686523854228898, "frac_reward_zero_std": 1.0, "grad_norm": 5.99163859006921e-10, "kl": 0.069091796875, "learning_rate": 1.0366041920238223e-06, "loss": 0.0028, "num_tokens": 2667414349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8688230775795852, "frac_reward_zero_std": 1.0, "grad_norm": 6.352775881572995e-10, "kl": 0.071044921875, "learning_rate": 1.033964246395468e-06, "loss": 0.0028, "num_tokens": 2667974493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8689937697362806, "frac_reward_zero_std": 1.0, "grad_norm": 3.771895030295704e-10, "kl": 0.070556640625, "learning_rate": 1.031327483386152e-06, "loss": 0.0028, "num_tokens": 2668547021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.869164461892976, "frac_reward_zero_std": 1.0, "grad_norm": 4.168438970437102e-10, "kl": 0.0704345703125, "learning_rate": 1.0286939039318277e-06, "loss": 0.0028, "num_tokens": 2669115277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8693351540496714, "frac_reward_zero_std": 1.0, "grad_norm": 4.019573185380421e-10, "kl": 0.0693359375, "learning_rate": 1.0260635089673233e-06, "loss": 0.0028, "num_tokens": 2669687485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8695058462063668, "frac_reward_zero_std": 1.0, "grad_norm": 6.124417932337215e-10, "kl": 0.0733642578125, "learning_rate": 1.0234362994263315e-06, "loss": 0.0029, "num_tokens": 2670249709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8696765383630622, "frac_reward_zero_std": 1.0, "grad_norm": 5.914547306781209e-10, "kl": 0.07275390625, "learning_rate": 1.0208122762414174e-06, "loss": 0.0029, "num_tokens": 2670813021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8698472305197577, "frac_reward_zero_std": 1.0, "grad_norm": 4.3808997318709945e-10, "kl": 0.0660400390625, "learning_rate": 1.0181914403440175e-06, "loss": 0.0026, "num_tokens": 2671382621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8700179226764531, "frac_reward_zero_std": 1.0, "grad_norm": 3.8366822413211903e-10, "kl": 0.06884765625, "learning_rate": 1.0155737926644305e-06, "loss": 0.0028, "num_tokens": 2671952461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8701886148331485, "frac_reward_zero_std": 1.0, "grad_norm": 4.0577199717051126e-10, "kl": 0.067626953125, "learning_rate": 1.0129593341318267e-06, "loss": 0.0027, "num_tokens": 2672520541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8703593069898438, "frac_reward_zero_std": 1.0, "grad_norm": 5.297764983126028e-10, "kl": 0.0712890625, "learning_rate": 1.010348065674246e-06, "loss": 0.0028, "num_tokens": 2673089293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8705299991465392, "frac_reward_zero_std": 1.0, "grad_norm": 5.609708931038892e-10, "kl": 0.0697021484375, "learning_rate": 1.0077399882185967e-06, "loss": 0.0028, "num_tokens": 2673654733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8707006913032346, "frac_reward_zero_std": 1.0, "grad_norm": 7.815123104462205e-10, "kl": 0.0692138671875, "learning_rate": 1.0051351026906486e-06, "loss": 0.0028, "num_tokens": 2674214349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.87087138345993, "frac_reward_zero_std": 1.0, "grad_norm": 7.680625035192338e-10, "kl": 0.0723876953125, "learning_rate": 1.0025334100150418e-06, "loss": 0.0029, "num_tokens": 2674773453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8710420756166254, "frac_reward_zero_std": 1.0, "grad_norm": 5.950923834353656e-10, "kl": 0.0697021484375, "learning_rate": 9.999349111152844e-07, "loss": 0.0028, "num_tokens": 2675339101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8712127677733208, "frac_reward_zero_std": 1.0, "grad_norm": 4.620507561115336e-10, "kl": 0.0682373046875, "learning_rate": 9.973396069137519e-07, "loss": 0.0027, "num_tokens": 2675911901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8713834599300162, "frac_reward_zero_std": 1.0, "grad_norm": 5.83829325577257e-10, "kl": 0.0726318359375, "learning_rate": 9.947474983316806e-07, "loss": 0.0029, "num_tokens": 2676474237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8715541520867116, "frac_reward_zero_std": 1.0, "grad_norm": 3.820211839448641e-10, "kl": 0.06884765625, "learning_rate": 9.92158586289176e-07, "loss": 0.0028, "num_tokens": 2677043885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.871724844243407, "frac_reward_zero_std": 1.0, "grad_norm": 4.6303220990333694e-10, "kl": 0.0665283203125, "learning_rate": 9.895728717052078e-07, "loss": 0.0027, "num_tokens": 2677608765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8718955364001024, "frac_reward_zero_std": 1.0, "grad_norm": 3.510449459475213e-10, "kl": 0.0679931640625, "learning_rate": 9.869903554976146e-07, "loss": 0.0027, "num_tokens": 2678181773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8720662285567978, "frac_reward_zero_std": 1.0, "grad_norm": 4.538462754431246e-10, "kl": 0.06884765625, "learning_rate": 9.844110385830952e-07, "loss": 0.0028, "num_tokens": 2678748957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8722369207134932, "frac_reward_zero_std": 1.0, "grad_norm": 5.88498326293337e-10, "kl": 0.0726318359375, "learning_rate": 9.818349218772094e-07, "loss": 0.0029, "num_tokens": 2679318269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8724076128701886, "frac_reward_zero_std": 1.0, "grad_norm": 2.7475565671540356e-10, "kl": 0.0660400390625, "learning_rate": 9.792620062943902e-07, "loss": 0.0026, "num_tokens": 2679891133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.872578305026884, "frac_reward_zero_std": 1.0, "grad_norm": 7.094504498787063e-10, "kl": 0.071044921875, "learning_rate": 9.766922927479293e-07, "loss": 0.0028, "num_tokens": 2680458797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8727489971835795, "frac_reward_zero_std": 1.0, "grad_norm": 7.373910316131601e-10, "kl": 0.0687255859375, "learning_rate": 9.74125782149985e-07, "loss": 0.0028, "num_tokens": 2681020717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8729196893402749, "frac_reward_zero_std": 1.0, "grad_norm": 5.636413730649885e-10, "kl": 0.0704345703125, "learning_rate": 9.715624754115683e-07, "loss": 0.0028, "num_tokens": 2681586589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8730903814969703, "frac_reward_zero_std": 1.0, "grad_norm": 6.297490281765547e-10, "kl": 0.0738525390625, "learning_rate": 9.690023734425658e-07, "loss": 0.003, "num_tokens": 2682164477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8732610736536656, "frac_reward_zero_std": 1.0, "grad_norm": 7.675001295089886e-10, "kl": 0.0711669921875, "learning_rate": 9.664454771517206e-07, "loss": 0.0028, "num_tokens": 2682727309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.873431765810361, "frac_reward_zero_std": 1.0, "grad_norm": 5.119434830529464e-10, "kl": 0.0714111328125, "learning_rate": 9.638917874466392e-07, "loss": 0.0029, "num_tokens": 2683292813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8736024579670564, "frac_reward_zero_std": 1.0, "grad_norm": 6.039499797589394e-10, "kl": 0.0693359375, "learning_rate": 9.613413052337894e-07, "loss": 0.0028, "num_tokens": 2683858637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8737731501237518, "frac_reward_zero_std": 1.0, "grad_norm": 5.761873240410759e-10, "kl": 0.0726318359375, "learning_rate": 9.587940314184973e-07, "loss": 0.0029, "num_tokens": 2684427357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8739438422804472, "frac_reward_zero_std": 1.0, "grad_norm": 5.894597614051117e-10, "kl": 0.0684814453125, "learning_rate": 9.562499669049552e-07, "loss": 0.0027, "num_tokens": 2684994045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8741145344371426, "frac_reward_zero_std": 1.0, "grad_norm": 5.757748139485298e-10, "kl": 0.06884765625, "learning_rate": 9.537091125962162e-07, "loss": 0.0027, "num_tokens": 2685557037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.874285226593838, "frac_reward_zero_std": 1.0, "grad_norm": 5.655929092709725e-10, "kl": 0.0721435546875, "learning_rate": 9.511714693941898e-07, "loss": 0.0029, "num_tokens": 2686120477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8744559187505334, "frac_reward_zero_std": 1.0, "grad_norm": 6.043096726188256e-10, "kl": 0.0758056640625, "learning_rate": 9.486370381996457e-07, "loss": 0.003, "num_tokens": 2686687357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8746266109072288, "frac_reward_zero_std": 1.0, "grad_norm": 5.161009930709986e-10, "kl": 0.0740966796875, "learning_rate": 9.461058199122187e-07, "loss": 0.003, "num_tokens": 2687253245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8747973030639242, "frac_reward_zero_std": 1.0, "grad_norm": 5.480856597899608e-10, "kl": 0.068359375, "learning_rate": 9.435778154303998e-07, "loss": 0.0027, "num_tokens": 2687819405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8749679952206196, "frac_reward_zero_std": 1.0, "grad_norm": 5.668545364419717e-10, "kl": 0.0706787109375, "learning_rate": 9.410530256515382e-07, "loss": 0.0028, "num_tokens": 2688386157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.875138687377315, "frac_reward_zero_std": 1.0, "grad_norm": 3.498701549735795e-10, "kl": 0.066650390625, "learning_rate": 9.385314514718413e-07, "loss": 0.0027, "num_tokens": 2688966125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8753093795340104, "frac_reward_zero_std": 1.0, "grad_norm": 3.864865584124533e-10, "kl": 0.0701904296875, "learning_rate": 9.360130937863799e-07, "loss": 0.0028, "num_tokens": 2689539245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8754800716907059, "frac_reward_zero_std": 1.0, "grad_norm": 4.770112102979648e-10, "kl": 0.0704345703125, "learning_rate": 9.334979534890798e-07, "loss": 0.0028, "num_tokens": 2690106781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8756507638474013, "frac_reward_zero_std": 1.0, "grad_norm": 6.01254952212242e-10, "kl": 0.0692138671875, "learning_rate": 9.309860314727249e-07, "loss": 0.0028, "num_tokens": 2690669821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8758214560040967, "frac_reward_zero_std": 1.0, "grad_norm": 4.758537913679678e-10, "kl": 0.0699462890625, "learning_rate": 9.28477328628954e-07, "loss": 0.0028, "num_tokens": 2691235549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.875992148160792, "frac_reward_zero_std": 1.0, "grad_norm": 5.166326915097714e-10, "kl": 0.0694580078125, "learning_rate": 9.259718458482681e-07, "loss": 0.0028, "num_tokens": 2691804605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8761628403174874, "frac_reward_zero_std": 1.0, "grad_norm": 5.109194512126551e-10, "kl": 0.069091796875, "learning_rate": 9.234695840200259e-07, "loss": 0.0028, "num_tokens": 2692372557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8763335324741828, "frac_reward_zero_std": 1.0, "grad_norm": 5.741731171471282e-10, "kl": 0.0723876953125, "learning_rate": 9.209705440324368e-07, "loss": 0.0029, "num_tokens": 2692937693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8765042246308782, "frac_reward_zero_std": 1.0, "grad_norm": 6.050381452693011e-10, "kl": 0.070556640625, "learning_rate": 9.184747267725691e-07, "loss": 0.0028, "num_tokens": 2693500189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8766749167875736, "frac_reward_zero_std": 1.0, "grad_norm": 5.239933785098842e-10, "kl": 0.070556640625, "learning_rate": 9.159821331263497e-07, "loss": 0.0028, "num_tokens": 2694064333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.876845608944269, "frac_reward_zero_std": 1.0, "grad_norm": 7.249364097479915e-10, "kl": 0.072021484375, "learning_rate": 9.134927639785596e-07, "loss": 0.0029, "num_tokens": 2694624941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8770163011009644, "frac_reward_zero_std": 1.0, "grad_norm": 4.3453396564296204e-10, "kl": 0.071533203125, "learning_rate": 9.110066202128321e-07, "loss": 0.0029, "num_tokens": 2695189533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8771869932576598, "frac_reward_zero_std": 1.0, "grad_norm": 5.841159771013786e-10, "kl": 0.0699462890625, "learning_rate": 9.085237027116634e-07, "loss": 0.0028, "num_tokens": 2695757949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8773576854143552, "frac_reward_zero_std": 1.0, "grad_norm": 3.8996661895967525e-10, "kl": 0.0728759765625, "learning_rate": 9.060440123563941e-07, "loss": 0.0029, "num_tokens": 2696327405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8775283775710506, "frac_reward_zero_std": 1.0, "grad_norm": 5.864497336608251e-10, "kl": 0.0703125, "learning_rate": 9.0356755002723e-07, "loss": 0.0028, "num_tokens": 2696889661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.877699069727746, "frac_reward_zero_std": 1.0, "grad_norm": 8.786523148667644e-10, "kl": 0.0755615234375, "learning_rate": 9.010943166032216e-07, "loss": 0.003, "num_tokens": 2697450893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8778697618844414, "frac_reward_zero_std": 1.0, "grad_norm": 4.784833537797893e-10, "kl": 0.072021484375, "learning_rate": 8.986243129622807e-07, "loss": 0.0029, "num_tokens": 2698017917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8780404540411368, "frac_reward_zero_std": 1.0, "grad_norm": 5.997329902811739e-10, "kl": 0.0684814453125, "learning_rate": 8.961575399811661e-07, "loss": 0.0027, "num_tokens": 2698590493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8782111461978322, "frac_reward_zero_std": 1.0, "grad_norm": 6.549361252365891e-10, "kl": 0.072509765625, "learning_rate": 8.936939985354964e-07, "loss": 0.0029, "num_tokens": 2699156925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8783818383545277, "frac_reward_zero_std": 1.0, "grad_norm": 5.352843915199966e-10, "kl": 0.0693359375, "learning_rate": 8.912336894997365e-07, "loss": 0.0028, "num_tokens": 2699721757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8785525305112231, "frac_reward_zero_std": 1.0, "grad_norm": 8.450346958018048e-10, "kl": 0.074951171875, "learning_rate": 8.887766137472121e-07, "loss": 0.003, "num_tokens": 2700284989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8787232226679184, "frac_reward_zero_std": 1.0, "grad_norm": 6.236859839697023e-10, "kl": 0.072021484375, "learning_rate": 8.863227721500911e-07, "loss": 0.0029, "num_tokens": 2700860733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8788939148246138, "frac_reward_zero_std": 1.0, "grad_norm": 3.898708087616085e-10, "kl": 0.0687255859375, "learning_rate": 8.838721655794025e-07, "loss": 0.0028, "num_tokens": 2701427901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8790646069813092, "frac_reward_zero_std": 1.0, "grad_norm": 5.908147577135842e-10, "kl": 0.069091796875, "learning_rate": 8.814247949050203e-07, "loss": 0.0028, "num_tokens": 2701992157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8792352991380046, "frac_reward_zero_std": 1.0, "grad_norm": 6.857976369754459e-10, "kl": 0.0706787109375, "learning_rate": 8.789806609956763e-07, "loss": 0.0028, "num_tokens": 2702551805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8794059912947, "frac_reward_zero_std": 1.0, "grad_norm": 4.6385052230846613e-10, "kl": 0.066650390625, "learning_rate": 8.765397647189467e-07, "loss": 0.0027, "num_tokens": 2703120061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8795766834513954, "frac_reward_zero_std": 1.0, "grad_norm": 6.507201504489851e-10, "kl": 0.0682373046875, "learning_rate": 8.741021069412636e-07, "loss": 0.0027, "num_tokens": 2703681853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8797473756080908, "frac_reward_zero_std": 1.0, "grad_norm": 4.1573278223091226e-10, "kl": 0.068359375, "learning_rate": 8.71667688527904e-07, "loss": 0.0027, "num_tokens": 2704250285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8799180677647862, "frac_reward_zero_std": 1.0, "grad_norm": 4.1715320198623786e-10, "kl": 0.0654296875, "learning_rate": 8.692365103430022e-07, "loss": 0.0026, "num_tokens": 2704833981.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8800887599214816, "frac_reward_zero_std": 1.0, "grad_norm": 7.580482563976011e-10, "kl": 0.07568359375, "learning_rate": 8.668085732495391e-07, "loss": 0.003, "num_tokens": 2705402589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.880259452078177, "frac_reward_zero_std": 1.0, "grad_norm": 3.9512693915543244e-10, "kl": 0.0687255859375, "learning_rate": 8.643838781093427e-07, "loss": 0.0028, "num_tokens": 2705969901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8804301442348724, "frac_reward_zero_std": 1.0, "grad_norm": 5.684593287486332e-10, "kl": 0.06884765625, "learning_rate": 8.619624257830916e-07, "loss": 0.0028, "num_tokens": 2706533741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8806008363915678, "frac_reward_zero_std": 1.0, "grad_norm": 5.235681389184813e-10, "kl": 0.071533203125, "learning_rate": 8.595442171303147e-07, "loss": 0.0029, "num_tokens": 2707097597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8807715285482632, "frac_reward_zero_std": 1.0, "grad_norm": 5.735234270638093e-10, "kl": 0.07470703125, "learning_rate": 8.571292530093911e-07, "loss": 0.003, "num_tokens": 2707658717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8809422207049586, "frac_reward_zero_std": 1.0, "grad_norm": 5.595753241421594e-10, "kl": 0.0697021484375, "learning_rate": 8.547175342775449e-07, "loss": 0.0028, "num_tokens": 2708226605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.881112912861654, "frac_reward_zero_std": 1.0, "grad_norm": 4.102366353294428e-10, "kl": 0.065673828125, "learning_rate": 8.523090617908458e-07, "loss": 0.0026, "num_tokens": 2708801341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8812836050183495, "frac_reward_zero_std": 1.0, "grad_norm": 4.270923698717158e-10, "kl": 0.070556640625, "learning_rate": 8.499038364042167e-07, "loss": 0.0028, "num_tokens": 2709370189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8814542971750448, "frac_reward_zero_std": 1.0, "grad_norm": 4.887437172268447e-10, "kl": 0.0687255859375, "learning_rate": 8.475018589714279e-07, "loss": 0.0028, "num_tokens": 2709938301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8816249893317402, "frac_reward_zero_std": 1.0, "grad_norm": 5.847537391914263e-10, "kl": 0.0731201171875, "learning_rate": 8.45103130345094e-07, "loss": 0.0029, "num_tokens": 2710507101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8817956814884356, "frac_reward_zero_std": 1.0, "grad_norm": 6.257780738933044e-10, "kl": 0.0693359375, "learning_rate": 8.42707651376673e-07, "loss": 0.0028, "num_tokens": 2711079517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.881966373645131, "frac_reward_zero_std": 1.0, "grad_norm": 5.176311196635896e-10, "kl": 0.070556640625, "learning_rate": 8.403154229164779e-07, "loss": 0.0028, "num_tokens": 2711641469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8821370658018264, "frac_reward_zero_std": 1.0, "grad_norm": 5.845229242022043e-10, "kl": 0.072509765625, "learning_rate": 8.379264458136627e-07, "loss": 0.0029, "num_tokens": 2712210205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8823077579585218, "frac_reward_zero_std": 1.0, "grad_norm": 5.579770075581279e-10, "kl": 0.0723876953125, "learning_rate": 8.35540720916228e-07, "loss": 0.0029, "num_tokens": 2712770365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8824784501152172, "frac_reward_zero_std": 1.0, "grad_norm": 5.873423118541695e-10, "kl": 0.070068359375, "learning_rate": 8.331582490710166e-07, "loss": 0.0028, "num_tokens": 2713335661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8826491422719126, "frac_reward_zero_std": 1.0, "grad_norm": 5.138081794513786e-10, "kl": 0.071533203125, "learning_rate": 8.30779031123724e-07, "loss": 0.0029, "num_tokens": 2713898013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.882819834428608, "frac_reward_zero_std": 1.0, "grad_norm": 5.349550542211806e-10, "kl": 0.068359375, "learning_rate": 8.284030679188871e-07, "loss": 0.0027, "num_tokens": 2714463677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8829905265853034, "frac_reward_zero_std": 1.0, "grad_norm": 4.6388998465686307e-10, "kl": 0.064697265625, "learning_rate": 8.260303602998842e-07, "loss": 0.0026, "num_tokens": 2715030941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8831612187419988, "frac_reward_zero_std": 1.0, "grad_norm": 6.070576498102617e-10, "kl": 0.07177734375, "learning_rate": 8.23660909108942e-07, "loss": 0.0029, "num_tokens": 2715594445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8833319108986942, "frac_reward_zero_std": 1.0, "grad_norm": 4.374500282081417e-10, "kl": 0.068115234375, "learning_rate": 8.212947151871287e-07, "loss": 0.0027, "num_tokens": 2716159373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8835026030553896, "frac_reward_zero_std": 1.0, "grad_norm": 5.650228015287909e-10, "kl": 0.071044921875, "learning_rate": 8.189317793743623e-07, "loss": 0.0028, "num_tokens": 2716722765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.883673295212085, "frac_reward_zero_std": 1.0, "grad_norm": 5.926799576173489e-10, "kl": 0.0701904296875, "learning_rate": 8.165721025093965e-07, "loss": 0.0028, "num_tokens": 2717290509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8838439873687804, "frac_reward_zero_std": 1.0, "grad_norm": 6.458340228042303e-10, "kl": 0.07177734375, "learning_rate": 8.142156854298289e-07, "loss": 0.0029, "num_tokens": 2717860141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8840146795254759, "frac_reward_zero_std": 1.0, "grad_norm": 4.616713947677812e-10, "kl": 0.0687255859375, "learning_rate": 8.118625289721061e-07, "loss": 0.0028, "num_tokens": 2718424013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8841853716821712, "frac_reward_zero_std": 1.0, "grad_norm": 4.93058841558683e-10, "kl": 0.0736083984375, "learning_rate": 8.095126339715142e-07, "loss": 0.0029, "num_tokens": 2718998877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8843560638388666, "frac_reward_zero_std": 1.0, "grad_norm": 7.545295749769659e-10, "kl": 0.0711669921875, "learning_rate": 8.071660012621785e-07, "loss": 0.0029, "num_tokens": 2719560125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.884526755995562, "frac_reward_zero_std": 1.0, "grad_norm": 5.87136094381922e-10, "kl": 0.0701904296875, "learning_rate": 8.048226316770713e-07, "loss": 0.0028, "num_tokens": 2720122557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8846974481522574, "frac_reward_zero_std": 1.0, "grad_norm": 6.155465404414566e-10, "kl": 0.0697021484375, "learning_rate": 8.024825260480019e-07, "loss": 0.0028, "num_tokens": 2720688269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8848681403089528, "frac_reward_zero_std": 1.0, "grad_norm": 8.527571278309678e-10, "kl": 0.07763671875, "learning_rate": 8.001456852056255e-07, "loss": 0.0031, "num_tokens": 2721246045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8850388324656482, "frac_reward_zero_std": 1.0, "grad_norm": 5.681238338869961e-10, "kl": 0.081787109375, "learning_rate": 7.978121099794334e-07, "loss": 0.0033, "num_tokens": 2721811805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8852095246223436, "frac_reward_zero_std": 1.0, "grad_norm": 4.996634344672455e-10, "kl": 0.0714111328125, "learning_rate": 7.954818011977639e-07, "loss": 0.0029, "num_tokens": 2722376077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.885380216779039, "frac_reward_zero_std": 1.0, "grad_norm": 6.868866575591288e-10, "kl": 0.073486328125, "learning_rate": 7.931547596877898e-07, "loss": 0.0029, "num_tokens": 2722938637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8855509089357344, "frac_reward_zero_std": 1.0, "grad_norm": 5.060680657365581e-10, "kl": 0.06982421875, "learning_rate": 7.908309862755292e-07, "loss": 0.0028, "num_tokens": 2723503853.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8857216010924298, "frac_reward_zero_std": 1.0, "grad_norm": 5.678422453269095e-10, "kl": 0.0660400390625, "learning_rate": 7.885104817858358e-07, "loss": 0.0026, "num_tokens": 2724068813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8858922932491252, "frac_reward_zero_std": 1.0, "grad_norm": 7.738502001527349e-10, "kl": 0.0694580078125, "learning_rate": 7.861932470424061e-07, "loss": 0.0028, "num_tokens": 2724627213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8860629854058206, "frac_reward_zero_std": 1.0, "grad_norm": 5.163582460533003e-10, "kl": 0.0694580078125, "learning_rate": 7.838792828677732e-07, "loss": 0.0028, "num_tokens": 2725192157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.886233677562516, "frac_reward_zero_std": 1.0, "grad_norm": 7.834330029747742e-10, "kl": 0.06982421875, "learning_rate": 7.815685900833147e-07, "loss": 0.0028, "num_tokens": 2725754029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8864043697192114, "frac_reward_zero_std": 1.0, "grad_norm": 5.29222430628753e-10, "kl": 0.0693359375, "learning_rate": 7.792611695092378e-07, "loss": 0.0028, "num_tokens": 2726322861.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8865750618759068, "frac_reward_zero_std": 1.0, "grad_norm": 6.547368832579211e-10, "kl": 0.06982421875, "learning_rate": 7.769570219645984e-07, "loss": 0.0028, "num_tokens": 2726884909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8867457540326023, "frac_reward_zero_std": 1.0, "grad_norm": 7.11610742392024e-10, "kl": 0.0706787109375, "learning_rate": 7.74656148267281e-07, "loss": 0.0028, "num_tokens": 2727447501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8869164461892975, "frac_reward_zero_std": 1.0, "grad_norm": 5.105773789587302e-10, "kl": 0.0709228515625, "learning_rate": 7.723585492340169e-07, "loss": 0.0028, "num_tokens": 2728013837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.887087138345993, "frac_reward_zero_std": 1.0, "grad_norm": 5.32118594988261e-10, "kl": 0.07080078125, "learning_rate": 7.700642256803681e-07, "loss": 0.0028, "num_tokens": 2728577213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8872578305026884, "frac_reward_zero_std": 1.0, "grad_norm": 4.902013014709148e-10, "kl": 0.0677490234375, "learning_rate": 7.677731784207377e-07, "loss": 0.0027, "num_tokens": 2729146013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8874285226593838, "frac_reward_zero_std": 1.0, "grad_norm": 5.396833898729902e-10, "kl": 0.070068359375, "learning_rate": 7.654854082683649e-07, "loss": 0.0028, "num_tokens": 2729710541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8875992148160792, "frac_reward_zero_std": 1.0, "grad_norm": 5.010630085502157e-10, "kl": 0.071044921875, "learning_rate": 7.632009160353271e-07, "loss": 0.0028, "num_tokens": 2730273677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8877699069727746, "frac_reward_zero_std": 1.0, "grad_norm": 5.577272093029856e-10, "kl": 0.070556640625, "learning_rate": 7.609197025325321e-07, "loss": 0.0028, "num_tokens": 2730838589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.88794059912947, "frac_reward_zero_std": 1.0, "grad_norm": 5.790407109716531e-10, "kl": 0.073974609375, "learning_rate": 7.586417685697311e-07, "loss": 0.003, "num_tokens": 2731401325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8881112912861654, "frac_reward_zero_std": 1.0, "grad_norm": 3.541690965454851e-10, "kl": 0.068115234375, "learning_rate": 7.563671149555108e-07, "loss": 0.0027, "num_tokens": 2731967773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8882819834428608, "frac_reward_zero_std": 1.0, "grad_norm": 5.504191000144451e-10, "kl": 0.072021484375, "learning_rate": 7.540957424972883e-07, "loss": 0.0029, "num_tokens": 2732530573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8884526755995562, "frac_reward_zero_std": 1.0, "grad_norm": 5.919900496945873e-10, "kl": 0.072265625, "learning_rate": 7.518276520013179e-07, "loss": 0.0029, "num_tokens": 2733095933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8886233677562516, "frac_reward_zero_std": 1.0, "grad_norm": 5.946830239473813e-10, "kl": 0.0716552734375, "learning_rate": 7.495628442726899e-07, "loss": 0.0029, "num_tokens": 2733659405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.888794059912947, "frac_reward_zero_std": 1.0, "grad_norm": 4.642483829860497e-10, "kl": 0.0745849609375, "learning_rate": 7.473013201153334e-07, "loss": 0.003, "num_tokens": 2734223261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8889647520696424, "frac_reward_zero_std": 1.0, "grad_norm": 5.893856166037964e-10, "kl": 0.072998046875, "learning_rate": 7.450430803320052e-07, "loss": 0.0029, "num_tokens": 2734785741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8891354442263378, "frac_reward_zero_std": 1.0, "grad_norm": 5.918181006823653e-10, "kl": 0.0709228515625, "learning_rate": 7.427881257242964e-07, "loss": 0.0028, "num_tokens": 2735352509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8893061363830332, "frac_reward_zero_std": 1.0, "grad_norm": 5.498830325845635e-10, "kl": 0.0689697265625, "learning_rate": 7.405364570926376e-07, "loss": 0.0028, "num_tokens": 2735914893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8894768285397286, "frac_reward_zero_std": 1.0, "grad_norm": 7.003806237082288e-10, "kl": 0.0718994140625, "learning_rate": 7.382880752362897e-07, "loss": 0.0029, "num_tokens": 2736476157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.889647520696424, "frac_reward_zero_std": 1.0, "grad_norm": 4.736340032018806e-10, "kl": 0.0733642578125, "learning_rate": 7.360429809533465e-07, "loss": 0.0029, "num_tokens": 2737041309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8898182128531194, "frac_reward_zero_std": 1.0, "grad_norm": 5.967032330422782e-10, "kl": 0.072998046875, "learning_rate": 7.338011750407348e-07, "loss": 0.0029, "num_tokens": 2737603581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8899889050098148, "frac_reward_zero_std": 1.0, "grad_norm": 6.164503720715378e-10, "kl": 0.0701904296875, "learning_rate": 7.315626582942148e-07, "loss": 0.0028, "num_tokens": 2738169549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8901595971665102, "frac_reward_zero_std": 1.0, "grad_norm": 4.883658825478751e-10, "kl": 0.071533203125, "learning_rate": 7.293274315083798e-07, "loss": 0.0029, "num_tokens": 2738738301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8903302893232056, "frac_reward_zero_std": 1.0, "grad_norm": 6.460057258220573e-10, "kl": 0.0692138671875, "learning_rate": 7.270954954766574e-07, "loss": 0.0028, "num_tokens": 2739301021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.890500981479901, "frac_reward_zero_std": 1.0, "grad_norm": 6.537908343807656e-10, "kl": 0.072265625, "learning_rate": 7.248668509912992e-07, "loss": 0.0029, "num_tokens": 2739862797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8906716736365964, "frac_reward_zero_std": 1.0, "grad_norm": 5.908417074187849e-10, "kl": 0.0711669921875, "learning_rate": 7.22641498843395e-07, "loss": 0.0028, "num_tokens": 2740428813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8908423657932918, "frac_reward_zero_std": 1.0, "grad_norm": 7.936376354126572e-10, "kl": 0.0721435546875, "learning_rate": 7.204194398228658e-07, "loss": 0.0029, "num_tokens": 2740990941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8910130579499872, "frac_reward_zero_std": 1.0, "grad_norm": 4.693330719031379e-10, "kl": 0.068115234375, "learning_rate": 7.182006747184634e-07, "loss": 0.0027, "num_tokens": 2741556733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8911837501066826, "frac_reward_zero_std": 1.0, "grad_norm": 3.8676759147441715e-10, "kl": 0.0689697265625, "learning_rate": 7.159852043177673e-07, "loss": 0.0028, "num_tokens": 2742129453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.891354442263378, "frac_reward_zero_std": 1.0, "grad_norm": 6.130746446732598e-10, "kl": 0.0694580078125, "learning_rate": 7.137730294071887e-07, "loss": 0.0028, "num_tokens": 2742693629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8915251344200734, "frac_reward_zero_std": 1.0, "grad_norm": 6.885883686899123e-10, "kl": 0.0697021484375, "learning_rate": 7.115641507719707e-07, "loss": 0.0028, "num_tokens": 2743255133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8916958265767688, "frac_reward_zero_std": 1.0, "grad_norm": 5.200756202240513e-10, "kl": 0.0714111328125, "learning_rate": 7.09358569196188e-07, "loss": 0.0029, "num_tokens": 2743820365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8918665187334642, "frac_reward_zero_std": 1.0, "grad_norm": 4.2494640370235145e-10, "kl": 0.0697021484375, "learning_rate": 7.0715628546274e-07, "loss": 0.0028, "num_tokens": 2744387821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8920372108901596, "frac_reward_zero_std": 1.0, "grad_norm": 4.4996350573974056e-10, "kl": 0.0709228515625, "learning_rate": 7.049573003533572e-07, "loss": 0.0028, "num_tokens": 2744958125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.892207903046855, "frac_reward_zero_std": 1.0, "grad_norm": 5.26274102835755e-10, "kl": 0.070068359375, "learning_rate": 7.027616146486005e-07, "loss": 0.0028, "num_tokens": 2745525757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8923785952035505, "frac_reward_zero_std": 1.0, "grad_norm": 6.772995785122574e-10, "kl": 0.071044921875, "learning_rate": 7.00569229127861e-07, "loss": 0.0028, "num_tokens": 2746088013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8925492873602457, "frac_reward_zero_std": 1.0, "grad_norm": 5.376656751265408e-10, "kl": 0.072021484375, "learning_rate": 6.983801445693561e-07, "loss": 0.0029, "num_tokens": 2746653517.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8927199795169412, "frac_reward_zero_std": 1.0, "grad_norm": 6.226744746606546e-10, "kl": 0.073974609375, "learning_rate": 6.961943617501277e-07, "loss": 0.003, "num_tokens": 2747218461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8928906716736366, "frac_reward_zero_std": 1.0, "grad_norm": 3.792103835035525e-10, "kl": 0.0704345703125, "learning_rate": 6.94011881446054e-07, "loss": 0.0028, "num_tokens": 2747789469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.893061363830332, "frac_reward_zero_std": 1.0, "grad_norm": 4.417667050473081e-10, "kl": 0.07177734375, "learning_rate": 6.918327044318363e-07, "loss": 0.0029, "num_tokens": 2748355773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8932320559870274, "frac_reward_zero_std": 1.0, "grad_norm": 6.141677202194072e-10, "kl": 0.0697021484375, "learning_rate": 6.896568314810026e-07, "loss": 0.0028, "num_tokens": 2748922669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8934027481437228, "frac_reward_zero_std": 1.0, "grad_norm": 6.66789730147821e-10, "kl": 0.0714111328125, "learning_rate": 6.874842633659085e-07, "loss": 0.0029, "num_tokens": 2749484813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8935734403004182, "frac_reward_zero_std": 1.0, "grad_norm": 6.197508922572561e-10, "kl": 0.0679931640625, "learning_rate": 6.853150008577391e-07, "loss": 0.0027, "num_tokens": 2750049837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8937441324571136, "frac_reward_zero_std": 1.0, "grad_norm": 4.1208535966928107e-10, "kl": 0.070556640625, "learning_rate": 6.831490447265043e-07, "loss": 0.0028, "num_tokens": 2750618813.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.893914824613809, "frac_reward_zero_std": 1.0, "grad_norm": 6.51993368837275e-10, "kl": 0.0726318359375, "learning_rate": 6.809863957410401e-07, "loss": 0.0029, "num_tokens": 2751179405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8940855167705044, "frac_reward_zero_std": 1.0, "grad_norm": 3.9032685272297173e-10, "kl": 0.070556640625, "learning_rate": 6.788270546690068e-07, "loss": 0.0028, "num_tokens": 2751754637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8942562089271998, "frac_reward_zero_std": 1.0, "grad_norm": 5.989594094030143e-10, "kl": 0.0711669921875, "learning_rate": 6.766710222768935e-07, "loss": 0.0028, "num_tokens": 2752319901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8944269010838952, "frac_reward_zero_std": 1.0, "grad_norm": 5.606678853532073e-10, "kl": 0.0679931640625, "learning_rate": 6.745182993300158e-07, "loss": 0.0027, "num_tokens": 2752893261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8945975932405906, "frac_reward_zero_std": 1.0, "grad_norm": 5.052208799330038e-10, "kl": 0.0738525390625, "learning_rate": 6.723688865925105e-07, "loss": 0.003, "num_tokens": 2753459661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.894768285397286, "frac_reward_zero_std": 1.0, "grad_norm": 3.904494579334741e-10, "kl": 0.06884765625, "learning_rate": 6.702227848273435e-07, "loss": 0.0028, "num_tokens": 2754027181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8949389775539814, "frac_reward_zero_std": 1.0, "grad_norm": 4.967336519664457e-10, "kl": 0.0692138671875, "learning_rate": 6.680799947963012e-07, "loss": 0.0028, "num_tokens": 2754598365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8951096697106768, "frac_reward_zero_std": 1.0, "grad_norm": 4.584522603692388e-10, "kl": 0.0711669921875, "learning_rate": 6.659405172599986e-07, "loss": 0.0028, "num_tokens": 2755165469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8952803618673721, "frac_reward_zero_std": 1.0, "grad_norm": 6.844037925524884e-10, "kl": 0.072509765625, "learning_rate": 6.638043529778715e-07, "loss": 0.0029, "num_tokens": 2755729293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8954510540240675, "frac_reward_zero_std": 1.0, "grad_norm": 6.464340496608539e-10, "kl": 0.07275390625, "learning_rate": 6.616715027081833e-07, "loss": 0.0029, "num_tokens": 2756295805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.895621746180763, "frac_reward_zero_std": 1.0, "grad_norm": 6.407929424774895e-10, "kl": 0.0709228515625, "learning_rate": 6.59541967208015e-07, "loss": 0.0028, "num_tokens": 2756857933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8957924383374584, "frac_reward_zero_std": 1.0, "grad_norm": 4.520672143533499e-10, "kl": 0.072509765625, "learning_rate": 6.574157472332787e-07, "loss": 0.0029, "num_tokens": 2757424157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8959631304941538, "frac_reward_zero_std": 1.0, "grad_norm": 5.813847093972472e-10, "kl": 0.0731201171875, "learning_rate": 6.55292843538703e-07, "loss": 0.0029, "num_tokens": 2757990797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8961338226508492, "frac_reward_zero_std": 1.0, "grad_norm": 5.73498438164367e-10, "kl": 0.06884765625, "learning_rate": 6.531732568778449e-07, "loss": 0.0028, "num_tokens": 2758555325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8963045148075446, "frac_reward_zero_std": 1.0, "grad_norm": 5.671067861288347e-10, "kl": 0.0697021484375, "learning_rate": 6.510569880030782e-07, "loss": 0.0028, "num_tokens": 2759120477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.89647520696424, "frac_reward_zero_std": 1.0, "grad_norm": 3.8368350079525355e-10, "kl": 0.070556640625, "learning_rate": 6.489440376656042e-07, "loss": 0.0028, "num_tokens": 2759693325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8966458991209354, "frac_reward_zero_std": 1.0, "grad_norm": 6.850931082992433e-10, "kl": 0.0751953125, "learning_rate": 6.468344066154419e-07, "loss": 0.003, "num_tokens": 2760257901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8968165912776308, "frac_reward_zero_std": 1.0, "grad_norm": 4.683531630004796e-10, "kl": 0.07421875, "learning_rate": 6.447280956014368e-07, "loss": 0.003, "num_tokens": 2760822397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8969872834343262, "frac_reward_zero_std": 1.0, "grad_norm": 4.961938268417282e-10, "kl": 0.071044921875, "learning_rate": 6.426251053712506e-07, "loss": 0.0028, "num_tokens": 2761389245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8971579755910216, "frac_reward_zero_std": 1.0, "grad_norm": 6.417801604123229e-10, "kl": 0.07080078125, "learning_rate": 6.40525436671372e-07, "loss": 0.0028, "num_tokens": 2761950173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.897328667747717, "frac_reward_zero_std": 1.0, "grad_norm": 6.290719513236627e-10, "kl": 0.07080078125, "learning_rate": 6.384290902471035e-07, "loss": 0.0028, "num_tokens": 2762514221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8974993599044124, "frac_reward_zero_std": 1.0, "grad_norm": 5.751853057546611e-10, "kl": 0.0701904296875, "learning_rate": 6.363360668425744e-07, "loss": 0.0028, "num_tokens": 2763075453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8976700520611078, "frac_reward_zero_std": 1.0, "grad_norm": 3.660259698926998e-10, "kl": 0.06591796875, "learning_rate": 6.342463672007348e-07, "loss": 0.0026, "num_tokens": 2763645917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8978407442178032, "frac_reward_zero_std": 1.0, "grad_norm": 4.596898932522374e-10, "kl": 0.069091796875, "learning_rate": 6.321599920633503e-07, "loss": 0.0028, "num_tokens": 2764213421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8980114363744985, "frac_reward_zero_std": 1.0, "grad_norm": 5.83236248426858e-10, "kl": 0.0728759765625, "learning_rate": 6.300769421710084e-07, "loss": 0.0029, "num_tokens": 2764778653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.898182128531194, "frac_reward_zero_std": 1.0, "grad_norm": 5.475574683696597e-10, "kl": 0.0711669921875, "learning_rate": 6.279972182631166e-07, "loss": 0.0028, "num_tokens": 2765343021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8983528206878894, "frac_reward_zero_std": 1.0, "grad_norm": 3.3806329978756713e-10, "kl": 0.0660400390625, "learning_rate": 6.259208210779033e-07, "loss": 0.0026, "num_tokens": 2765912637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8985235128445848, "frac_reward_zero_std": 1.0, "grad_norm": 5.21444977607268e-10, "kl": 0.0716552734375, "learning_rate": 6.238477513524144e-07, "loss": 0.0029, "num_tokens": 2766476493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8986942050012802, "frac_reward_zero_std": 1.0, "grad_norm": 4.243411093977623e-10, "kl": 0.0673828125, "learning_rate": 6.217780098225124e-07, "loss": 0.0027, "num_tokens": 2767045149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8988648971579756, "frac_reward_zero_std": 1.0, "grad_norm": 5.096573678280769e-10, "kl": 0.0701904296875, "learning_rate": 6.197115972228829e-07, "loss": 0.0028, "num_tokens": 2767609021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.899035589314671, "frac_reward_zero_std": 1.0, "grad_norm": 5.69834885160525e-10, "kl": 0.067138671875, "learning_rate": 6.176485142870281e-07, "loss": 0.0027, "num_tokens": 2768171149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8992062814713664, "frac_reward_zero_std": 1.0, "grad_norm": 5.700827544355635e-10, "kl": 0.0736083984375, "learning_rate": 6.155887617472678e-07, "loss": 0.0029, "num_tokens": 2768738429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8993769736280618, "frac_reward_zero_std": 1.0, "grad_norm": 6.680695661331184e-10, "kl": 0.070068359375, "learning_rate": 6.135323403347381e-07, "loss": 0.0028, "num_tokens": 2769304461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8995476657847572, "frac_reward_zero_std": 1.0, "grad_norm": 6.36978944021202e-10, "kl": 0.078125, "learning_rate": 6.114792507793954e-07, "loss": 0.0031, "num_tokens": 2769866749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8997183579414526, "frac_reward_zero_std": 1.0, "grad_norm": 3.4883191555871054e-10, "kl": 0.0718994140625, "learning_rate": 6.094294938100143e-07, "loss": 0.0029, "num_tokens": 2770439293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.899889050098148, "frac_reward_zero_std": 1.0, "grad_norm": 5.811263337182997e-10, "kl": 0.071533203125, "learning_rate": 6.073830701541816e-07, "loss": 0.0029, "num_tokens": 2771003677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9000597422548434, "frac_reward_zero_std": 1.0, "grad_norm": 3.966499102767129e-10, "kl": 0.0673828125, "learning_rate": 6.053399805383042e-07, "loss": 0.0027, "num_tokens": 2771570589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9002304344115388, "frac_reward_zero_std": 1.0, "grad_norm": 3.6561992323913907e-10, "kl": 0.07177734375, "learning_rate": 6.033002256876064e-07, "loss": 0.0029, "num_tokens": 2772139453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9004011265682342, "frac_reward_zero_std": 1.0, "grad_norm": 3.9899264320248977e-10, "kl": 0.0682373046875, "learning_rate": 6.012638063261278e-07, "loss": 0.0027, "num_tokens": 2772707341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9005718187249296, "frac_reward_zero_std": 1.0, "grad_norm": 7.280700214528141e-10, "kl": 0.07275390625, "learning_rate": 5.992307231767236e-07, "loss": 0.0029, "num_tokens": 2773272397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9007425108816249, "frac_reward_zero_std": 1.0, "grad_norm": 3.8136228415099636e-10, "kl": 0.068359375, "learning_rate": 5.972009769610632e-07, "loss": 0.0027, "num_tokens": 2773844461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9009132030383203, "frac_reward_zero_std": 1.0, "grad_norm": 6.739714050070917e-10, "kl": 0.0733642578125, "learning_rate": 5.951745683996335e-07, "loss": 0.0029, "num_tokens": 2774408125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9010838951950157, "frac_reward_zero_std": 1.0, "grad_norm": 6.761121543800135e-10, "kl": 0.0701904296875, "learning_rate": 5.931514982117381e-07, "loss": 0.0028, "num_tokens": 2774975037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9012545873517112, "frac_reward_zero_std": 1.0, "grad_norm": 5.620589740519547e-10, "kl": 0.06787109375, "learning_rate": 5.911317671154959e-07, "loss": 0.0027, "num_tokens": 2775550957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9014252795084066, "frac_reward_zero_std": 1.0, "grad_norm": 5.110530099751538e-10, "kl": 0.0743408203125, "learning_rate": 5.891153758278323e-07, "loss": 0.003, "num_tokens": 2776118077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.901595971665102, "frac_reward_zero_std": 1.0, "grad_norm": 7.562609970483021e-10, "kl": 0.0753173828125, "learning_rate": 5.871023250644981e-07, "loss": 0.003, "num_tokens": 2776680397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9017666638217974, "frac_reward_zero_std": 1.0, "grad_norm": 7.188372313012513e-10, "kl": 0.0753173828125, "learning_rate": 5.850926155400504e-07, "loss": 0.003, "num_tokens": 2777248509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9019373559784928, "frac_reward_zero_std": 1.0, "grad_norm": 5.817391150922484e-10, "kl": 0.0697021484375, "learning_rate": 5.830862479678689e-07, "loss": 0.0028, "num_tokens": 2777813373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9021080481351882, "frac_reward_zero_std": 1.0, "grad_norm": 4.6407559354944195e-10, "kl": 0.0679931640625, "learning_rate": 5.810832230601371e-07, "loss": 0.0027, "num_tokens": 2778389181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9022787402918836, "frac_reward_zero_std": 1.0, "grad_norm": 4.988563867238052e-10, "kl": 0.0679931640625, "learning_rate": 5.790835415278573e-07, "loss": 0.0027, "num_tokens": 2778955021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.902449432448579, "frac_reward_zero_std": 1.0, "grad_norm": 4.5080664347338525e-10, "kl": 0.0687255859375, "learning_rate": 5.770872040808439e-07, "loss": 0.0028, "num_tokens": 2779522605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9026201246052744, "frac_reward_zero_std": 1.0, "grad_norm": 5.299441988277069e-10, "kl": 0.0706787109375, "learning_rate": 5.750942114277269e-07, "loss": 0.0028, "num_tokens": 2780090205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9027908167619698, "frac_reward_zero_std": 1.0, "grad_norm": 5.468428348764073e-10, "kl": 0.0703125, "learning_rate": 5.731045642759459e-07, "loss": 0.0028, "num_tokens": 2780654765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9029615089186652, "frac_reward_zero_std": 1.0, "grad_norm": 4.868204253689985e-10, "kl": 0.0716552734375, "learning_rate": 5.711182633317514e-07, "loss": 0.0029, "num_tokens": 2781218541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9031322010753606, "frac_reward_zero_std": 1.0, "grad_norm": 3.6311490049074104e-10, "kl": 0.07275390625, "learning_rate": 5.691353093002106e-07, "loss": 0.0029, "num_tokens": 2781787037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.903302893232056, "frac_reward_zero_std": 1.0, "grad_norm": 4.947922244775131e-10, "kl": 0.072509765625, "learning_rate": 5.671557028852015e-07, "loss": 0.0029, "num_tokens": 2782359549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9034735853887513, "frac_reward_zero_std": 1.0, "grad_norm": 7.168483611664327e-10, "kl": 0.0738525390625, "learning_rate": 5.65179444789411e-07, "loss": 0.0029, "num_tokens": 2782923021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9036442775454467, "frac_reward_zero_std": 1.0, "grad_norm": 5.222575896587483e-10, "kl": 0.0721435546875, "learning_rate": 5.632065357143401e-07, "loss": 0.0029, "num_tokens": 2783490269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9038149697021421, "frac_reward_zero_std": 1.0, "grad_norm": 5.459752403181229e-10, "kl": 0.0672607421875, "learning_rate": 5.612369763602987e-07, "loss": 0.0027, "num_tokens": 2784061613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9039856618588376, "frac_reward_zero_std": 1.0, "grad_norm": 5.060383059103282e-10, "kl": 0.068115234375, "learning_rate": 5.592707674264142e-07, "loss": 0.0027, "num_tokens": 2784626989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.904156354015533, "frac_reward_zero_std": 1.0, "grad_norm": 5.05183862919253e-10, "kl": 0.0703125, "learning_rate": 5.573079096106149e-07, "loss": 0.0028, "num_tokens": 2785196029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9043270461722284, "frac_reward_zero_std": 1.0, "grad_norm": 4.896839424170617e-10, "kl": 0.06591796875, "learning_rate": 5.553484036096458e-07, "loss": 0.0026, "num_tokens": 2785763341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9044977383289238, "frac_reward_zero_std": 1.0, "grad_norm": 5.915962227851516e-10, "kl": 0.0712890625, "learning_rate": 5.533922501190602e-07, "loss": 0.0029, "num_tokens": 2786326589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9046684304856192, "frac_reward_zero_std": 1.0, "grad_norm": 6.083807466642312e-10, "kl": 0.0716552734375, "learning_rate": 5.514394498332254e-07, "loss": 0.0029, "num_tokens": 2786890557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9048391226423146, "frac_reward_zero_std": 1.0, "grad_norm": 7.186563561268335e-10, "kl": 0.0712890625, "learning_rate": 5.494900034453099e-07, "loss": 0.0029, "num_tokens": 2787450909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.90500981479901, "frac_reward_zero_std": 1.0, "grad_norm": 5.508487588495072e-10, "kl": 0.0687255859375, "learning_rate": 5.475439116473014e-07, "loss": 0.0027, "num_tokens": 2788016157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9051805069557054, "frac_reward_zero_std": 1.0, "grad_norm": 6.436528208056187e-10, "kl": 0.07763671875, "learning_rate": 5.456011751299884e-07, "loss": 0.0031, "num_tokens": 2788579485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9053511991124008, "frac_reward_zero_std": 1.0, "grad_norm": 3.6251408201446536e-10, "kl": 0.0704345703125, "learning_rate": 5.436617945829758e-07, "loss": 0.0028, "num_tokens": 2789147933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9055218912690962, "frac_reward_zero_std": 1.0, "grad_norm": 4.546427495889615e-10, "kl": 0.06884765625, "learning_rate": 5.4172577069467e-07, "loss": 0.0028, "num_tokens": 2789712461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9056925834257916, "frac_reward_zero_std": 1.0, "grad_norm": 5.504929973234479e-10, "kl": 0.0697021484375, "learning_rate": 5.397931041522942e-07, "loss": 0.0028, "num_tokens": 2790278845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.905863275582487, "frac_reward_zero_std": 1.0, "grad_norm": 6.160261123031007e-10, "kl": 0.0694580078125, "learning_rate": 5.378637956418697e-07, "loss": 0.0028, "num_tokens": 2790840925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9060339677391824, "frac_reward_zero_std": 1.0, "grad_norm": 6.421902773742725e-10, "kl": 0.072265625, "learning_rate": 5.359378458482367e-07, "loss": 0.0029, "num_tokens": 2791400189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9062046598958778, "frac_reward_zero_std": 1.0, "grad_norm": 4.709502857790523e-10, "kl": 0.068359375, "learning_rate": 5.340152554550326e-07, "loss": 0.0027, "num_tokens": 2791972845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9063753520525731, "frac_reward_zero_std": 1.0, "grad_norm": 5.547395649870301e-10, "kl": 0.0701904296875, "learning_rate": 5.320960251447127e-07, "loss": 0.0028, "num_tokens": 2792536477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9065460442092685, "frac_reward_zero_std": 1.0, "grad_norm": 8.015739352483935e-10, "kl": 0.0762939453125, "learning_rate": 5.301801555985309e-07, "loss": 0.0031, "num_tokens": 2793097181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.906716736365964, "frac_reward_zero_std": 1.0, "grad_norm": 7.295026363680765e-10, "kl": 0.071533203125, "learning_rate": 5.282676474965543e-07, "loss": 0.0029, "num_tokens": 2793658541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9068874285226594, "frac_reward_zero_std": 1.0, "grad_norm": 7.649931908051059e-10, "kl": 0.0726318359375, "learning_rate": 5.26358501517652e-07, "loss": 0.0029, "num_tokens": 2794219709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9070581206793548, "frac_reward_zero_std": 1.0, "grad_norm": 4.511396246131521e-10, "kl": 0.0709228515625, "learning_rate": 5.244527183395032e-07, "loss": 0.0028, "num_tokens": 2794794317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9072288128360502, "frac_reward_zero_std": 1.0, "grad_norm": 5.441618222935786e-10, "kl": 0.0716552734375, "learning_rate": 5.225502986385911e-07, "loss": 0.0029, "num_tokens": 2795361005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9073995049927456, "frac_reward_zero_std": 1.0, "grad_norm": 7.420601984798555e-10, "kl": 0.073486328125, "learning_rate": 5.206512430902077e-07, "loss": 0.0029, "num_tokens": 2795925005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.907570197149441, "frac_reward_zero_std": 1.0, "grad_norm": 4.103767277882847e-10, "kl": 0.0673828125, "learning_rate": 5.187555523684473e-07, "loss": 0.0027, "num_tokens": 2796491677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9077408893061364, "frac_reward_zero_std": 1.0, "grad_norm": 5.623800289437345e-10, "kl": 0.071533203125, "learning_rate": 5.168632271462126e-07, "loss": 0.0029, "num_tokens": 2797055693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9079115814628318, "frac_reward_zero_std": 1.0, "grad_norm": 5.799352038536962e-10, "kl": 0.068359375, "learning_rate": 5.149742680952108e-07, "loss": 0.0027, "num_tokens": 2797620333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9080822736195272, "frac_reward_zero_std": 1.0, "grad_norm": 6.033827655109669e-10, "kl": 0.0714111328125, "learning_rate": 5.130886758859555e-07, "loss": 0.0029, "num_tokens": 2798181501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9082529657762226, "frac_reward_zero_std": 1.0, "grad_norm": 4.0824210823151393e-10, "kl": 0.067138671875, "learning_rate": 5.112064511877602e-07, "loss": 0.0027, "num_tokens": 2798749181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.908423657932918, "frac_reward_zero_std": 1.0, "grad_norm": 9.269633823197204e-10, "kl": 0.0765380859375, "learning_rate": 5.093275946687492e-07, "loss": 0.0031, "num_tokens": 2799308301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9085943500896134, "frac_reward_zero_std": 1.0, "grad_norm": 5.900905517486745e-10, "kl": 0.0728759765625, "learning_rate": 5.074521069958504e-07, "loss": 0.0029, "num_tokens": 2799879997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9087650422463088, "frac_reward_zero_std": 1.0, "grad_norm": 7.889712956021947e-10, "kl": 0.074462890625, "learning_rate": 5.055799888347912e-07, "loss": 0.003, "num_tokens": 2800442765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9089357344030042, "frac_reward_zero_std": 1.0, "grad_norm": 3.793459093918072e-10, "kl": 0.07177734375, "learning_rate": 5.037112408501066e-07, "loss": 0.0029, "num_tokens": 2801011773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9091064265596995, "frac_reward_zero_std": 1.0, "grad_norm": 5.588054543999344e-10, "kl": 0.071533203125, "learning_rate": 5.018458637051348e-07, "loss": 0.0029, "num_tokens": 2801576173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9092771187163949, "frac_reward_zero_std": 1.0, "grad_norm": 5.716989884330679e-10, "kl": 0.0738525390625, "learning_rate": 4.999838580620197e-07, "loss": 0.003, "num_tokens": 2802140045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9094478108730903, "frac_reward_zero_std": 1.0, "grad_norm": 7.083594893587118e-10, "kl": 0.071044921875, "learning_rate": 4.981252245817036e-07, "loss": 0.0028, "num_tokens": 2802702445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9096185030297858, "frac_reward_zero_std": 1.0, "grad_norm": 4.67900279466929e-10, "kl": 0.0716552734375, "learning_rate": 4.962699639239333e-07, "loss": 0.0029, "num_tokens": 2803269885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9097891951864812, "frac_reward_zero_std": 1.0, "grad_norm": 3.8446633401680803e-10, "kl": 0.0682373046875, "learning_rate": 4.944180767472606e-07, "loss": 0.0027, "num_tokens": 2803840493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9099598873431766, "frac_reward_zero_std": 1.0, "grad_norm": 6.072252867655174e-10, "kl": 0.069580078125, "learning_rate": 4.925695637090411e-07, "loss": 0.0028, "num_tokens": 2804403693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.910130579499872, "frac_reward_zero_std": 1.0, "grad_norm": 4.786779142261699e-10, "kl": 0.0712890625, "learning_rate": 4.907244254654264e-07, "loss": 0.0029, "num_tokens": 2804969213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9103012716565674, "frac_reward_zero_std": 1.0, "grad_norm": 6.006334717074719e-10, "kl": 0.0706787109375, "learning_rate": 4.888826626713761e-07, "loss": 0.0028, "num_tokens": 2805535229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9104719638132628, "frac_reward_zero_std": 1.0, "grad_norm": 7.426813197551962e-10, "kl": 0.07373046875, "learning_rate": 4.870442759806482e-07, "loss": 0.0029, "num_tokens": 2806095901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9106426559699582, "frac_reward_zero_std": 1.0, "grad_norm": 4.5791920466794197e-10, "kl": 0.0694580078125, "learning_rate": 4.852092660458052e-07, "loss": 0.0028, "num_tokens": 2806664157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9108133481266536, "frac_reward_zero_std": 1.0, "grad_norm": 4.177627658773372e-10, "kl": 0.0670166015625, "learning_rate": 4.833776335182084e-07, "loss": 0.0027, "num_tokens": 2807232797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.910984040283349, "frac_reward_zero_std": 1.0, "grad_norm": 3.8767378423343355e-10, "kl": 0.069580078125, "learning_rate": 4.815493790480208e-07, "loss": 0.0028, "num_tokens": 2807808349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9111547324400444, "frac_reward_zero_std": 1.0, "grad_norm": 5.167561330336515e-10, "kl": 0.06884765625, "learning_rate": 4.797245032842069e-07, "loss": 0.0028, "num_tokens": 2808376749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9113254245967398, "frac_reward_zero_std": 1.0, "grad_norm": 6.628973507610822e-10, "kl": 0.070556640625, "learning_rate": 4.779030068745327e-07, "loss": 0.0028, "num_tokens": 2808937069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9114961167534352, "frac_reward_zero_std": 1.0, "grad_norm": 4.5549896161958334e-10, "kl": 0.0703125, "learning_rate": 4.7608489046556463e-07, "loss": 0.0028, "num_tokens": 2809518589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9116668089101306, "frac_reward_zero_std": 1.0, "grad_norm": 6.760353323759279e-10, "kl": 0.0718994140625, "learning_rate": 4.7427015470266513e-07, "loss": 0.0029, "num_tokens": 2810080349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9118375010668259, "frac_reward_zero_std": 1.0, "grad_norm": 6.199613913247075e-10, "kl": 0.072509765625, "learning_rate": 4.7245880023000123e-07, "loss": 0.0029, "num_tokens": 2810648077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9120081932235213, "frac_reward_zero_std": 1.0, "grad_norm": 5.895901588879597e-10, "kl": 0.0709228515625, "learning_rate": 4.706508276905386e-07, "loss": 0.0028, "num_tokens": 2811211389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9121788853802167, "frac_reward_zero_std": 1.0, "grad_norm": 6.368597579053802e-10, "kl": 0.072998046875, "learning_rate": 4.6884623772604385e-07, "loss": 0.0029, "num_tokens": 2811776381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9123495775369121, "frac_reward_zero_std": 1.0, "grad_norm": 7.896216558364453e-10, "kl": 0.0753173828125, "learning_rate": 4.670450309770802e-07, "loss": 0.003, "num_tokens": 2812336429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9125202696936076, "frac_reward_zero_std": 1.0, "grad_norm": 5.390339740826392e-10, "kl": 0.068115234375, "learning_rate": 4.652472080830095e-07, "loss": 0.0027, "num_tokens": 2812905149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.912690961850303, "frac_reward_zero_std": 1.0, "grad_norm": 4.551890702176194e-10, "kl": 0.0714111328125, "learning_rate": 4.634527696819946e-07, "loss": 0.0029, "num_tokens": 2813475181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9128616540069984, "frac_reward_zero_std": 1.0, "grad_norm": 6.028606101911707e-10, "kl": 0.072265625, "learning_rate": 4.616617164109982e-07, "loss": 0.0029, "num_tokens": 2814045613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9130323461636938, "frac_reward_zero_std": 1.0, "grad_norm": 5.056970364210575e-10, "kl": 0.068603515625, "learning_rate": 4.5987404890577846e-07, "loss": 0.0027, "num_tokens": 2814614525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9132030383203892, "frac_reward_zero_std": 1.0, "grad_norm": 5.543515376640185e-10, "kl": 0.06982421875, "learning_rate": 4.580897678008911e-07, "loss": 0.0028, "num_tokens": 2815181677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9133737304770846, "frac_reward_zero_std": 1.0, "grad_norm": 5.018133665954559e-10, "kl": 0.070068359375, "learning_rate": 4.563088737296928e-07, "loss": 0.0028, "num_tokens": 2815744909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.91354442263378, "frac_reward_zero_std": 1.0, "grad_norm": 5.108694772803227e-10, "kl": 0.06982421875, "learning_rate": 4.545313673243379e-07, "loss": 0.0028, "num_tokens": 2816313933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9137151147904754, "frac_reward_zero_std": 1.0, "grad_norm": 3.6737134302456246e-10, "kl": 0.0672607421875, "learning_rate": 4.5275724921577615e-07, "loss": 0.0027, "num_tokens": 2816896333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9138858069471708, "frac_reward_zero_std": 1.0, "grad_norm": 5.885817605227694e-10, "kl": 0.0701904296875, "learning_rate": 4.5098652003375486e-07, "loss": 0.0028, "num_tokens": 2817459085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9140564991038662, "frac_reward_zero_std": 1.0, "grad_norm": 5.626406128004355e-10, "kl": 0.067138671875, "learning_rate": 4.49219180406818e-07, "loss": 0.0027, "num_tokens": 2818029277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9142271912605616, "frac_reward_zero_std": 1.0, "grad_norm": 3.9164740585693405e-10, "kl": 0.07080078125, "learning_rate": 4.4745523096231145e-07, "loss": 0.0028, "num_tokens": 2818595005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.914397883417257, "frac_reward_zero_std": 1.0, "grad_norm": 5.350221137067374e-10, "kl": 0.0673828125, "learning_rate": 4.4569467232636997e-07, "loss": 0.0027, "num_tokens": 2819163821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9145685755739523, "frac_reward_zero_std": 1.0, "grad_norm": 4.939556662079029e-10, "kl": 0.0694580078125, "learning_rate": 4.4393750512392806e-07, "loss": 0.0028, "num_tokens": 2819728445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9147392677306477, "frac_reward_zero_std": 1.0, "grad_norm": 3.302310452874501e-10, "kl": 0.06640625, "learning_rate": 4.4218372997871796e-07, "loss": 0.0027, "num_tokens": 2820300989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9149099598873431, "frac_reward_zero_std": 1.0, "grad_norm": 5.71359131513102e-10, "kl": 0.070556640625, "learning_rate": 4.404333475132672e-07, "loss": 0.0028, "num_tokens": 2820864941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9150806520440385, "frac_reward_zero_std": 1.0, "grad_norm": 4.2163354902758846e-10, "kl": 0.0723876953125, "learning_rate": 4.386863583488965e-07, "loss": 0.0029, "num_tokens": 2821438157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.915251344200734, "frac_reward_zero_std": 1.0, "grad_norm": 5.443489027001588e-10, "kl": 0.06884765625, "learning_rate": 4.369427631057266e-07, "loss": 0.0028, "num_tokens": 2822008509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9154220363574294, "frac_reward_zero_std": 1.0, "grad_norm": 4.987374996375767e-10, "kl": 0.0716552734375, "learning_rate": 4.352025624026679e-07, "loss": 0.0029, "num_tokens": 2822577117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9155927285141248, "frac_reward_zero_std": 1.0, "grad_norm": 4.645402047110305e-10, "kl": 0.070556640625, "learning_rate": 4.3346575685743074e-07, "loss": 0.0028, "num_tokens": 2823142909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9157634206708202, "frac_reward_zero_std": 1.0, "grad_norm": 7.18440342148611e-10, "kl": 0.06884765625, "learning_rate": 4.317323470865176e-07, "loss": 0.0028, "num_tokens": 2823703165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9159341128275156, "frac_reward_zero_std": 1.0, "grad_norm": 6.511202457293448e-10, "kl": 0.0701904296875, "learning_rate": 4.3000233370522725e-07, "loss": 0.0028, "num_tokens": 2824268669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.916104804984211, "frac_reward_zero_std": 1.0, "grad_norm": 5.548084149263862e-10, "kl": 0.0675048828125, "learning_rate": 4.282757173276497e-07, "loss": 0.0027, "num_tokens": 2824835789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9162754971409064, "frac_reward_zero_std": 1.0, "grad_norm": 6.582902031231492e-10, "kl": 0.069580078125, "learning_rate": 4.265524985666758e-07, "loss": 0.0028, "num_tokens": 2825397229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9164461892976018, "frac_reward_zero_std": 1.0, "grad_norm": 6.370506831529183e-10, "kl": 0.0743408203125, "learning_rate": 4.2483267803398063e-07, "loss": 0.003, "num_tokens": 2825961389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9166168814542972, "frac_reward_zero_std": 1.0, "grad_norm": 4.48840949941023e-10, "kl": 0.067626953125, "learning_rate": 4.231162563400426e-07, "loss": 0.0027, "num_tokens": 2826524205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9167875736109926, "frac_reward_zero_std": 1.0, "grad_norm": 6.430576188824898e-10, "kl": 0.0711669921875, "learning_rate": 4.214032340941265e-07, "loss": 0.0028, "num_tokens": 2827084061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.916958265767688, "frac_reward_zero_std": 1.0, "grad_norm": 6.884619260968293e-10, "kl": 0.0684814453125, "learning_rate": 4.1969361190429713e-07, "loss": 0.0027, "num_tokens": 2827649197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9171289579243834, "frac_reward_zero_std": 1.0, "grad_norm": 6.452828278642806e-10, "kl": 0.0721435546875, "learning_rate": 4.1798739037740454e-07, "loss": 0.0029, "num_tokens": 2828210125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9172996500810787, "frac_reward_zero_std": 1.0, "grad_norm": 6.554589320533325e-10, "kl": 0.0711669921875, "learning_rate": 4.1628457011909874e-07, "loss": 0.0028, "num_tokens": 2828769917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9174703422377741, "frac_reward_zero_std": 1.0, "grad_norm": 5.363348503511594e-10, "kl": 0.0711669921875, "learning_rate": 4.145851517338162e-07, "loss": 0.0029, "num_tokens": 2829335661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9176410343944695, "frac_reward_zero_std": 1.0, "grad_norm": 5.89706473997266e-10, "kl": 0.072021484375, "learning_rate": 4.128891358247933e-07, "loss": 0.0029, "num_tokens": 2829897133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9178117265511649, "frac_reward_zero_std": 1.0, "grad_norm": 7.021737287475087e-10, "kl": 0.0726318359375, "learning_rate": 4.111965229940518e-07, "loss": 0.0029, "num_tokens": 2830458829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9179824187078603, "frac_reward_zero_std": 1.0, "grad_norm": 7.292050889391063e-10, "kl": 0.07275390625, "learning_rate": 4.0950731384240885e-07, "loss": 0.0029, "num_tokens": 2831020877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9181531108645558, "frac_reward_zero_std": 1.0, "grad_norm": 5.075981855162964e-10, "kl": 0.07275390625, "learning_rate": 4.078215089694715e-07, "loss": 0.0029, "num_tokens": 2831589293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9183238030212512, "frac_reward_zero_std": 1.0, "grad_norm": 2.490824472664266e-10, "kl": 0.068359375, "learning_rate": 4.061391089736422e-07, "loss": 0.0027, "num_tokens": 2832159101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9184944951779466, "frac_reward_zero_std": 1.0, "grad_norm": 4.895523841878842e-10, "kl": 0.073486328125, "learning_rate": 4.0446011445210876e-07, "loss": 0.0029, "num_tokens": 2832724125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.918665187334642, "frac_reward_zero_std": 1.0, "grad_norm": 7.624980591593549e-10, "kl": 0.0771484375, "learning_rate": 4.0278452600085674e-07, "loss": 0.0031, "num_tokens": 2833284653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9188358794913374, "frac_reward_zero_std": 1.0, "grad_norm": 7.706507339579162e-10, "kl": 0.0714111328125, "learning_rate": 4.0111234421465917e-07, "loss": 0.0029, "num_tokens": 2833844141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9190065716480328, "frac_reward_zero_std": 1.0, "grad_norm": 4.4340571715915565e-10, "kl": 0.0740966796875, "learning_rate": 3.994435696870791e-07, "loss": 0.003, "num_tokens": 2834409533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9191772638047282, "frac_reward_zero_std": 1.0, "grad_norm": 9.095646325973878e-10, "kl": 0.07177734375, "learning_rate": 3.9777820301047155e-07, "loss": 0.0029, "num_tokens": 2834970109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9193479559614236, "frac_reward_zero_std": 1.0, "grad_norm": 4.34192717399992e-10, "kl": 0.06982421875, "learning_rate": 3.9611624477598034e-07, "loss": 0.0028, "num_tokens": 2835545037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.919518648118119, "frac_reward_zero_std": 1.0, "grad_norm": 5.164443955378168e-10, "kl": 0.072021484375, "learning_rate": 3.9445769557354353e-07, "loss": 0.0029, "num_tokens": 2836122285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9196893402748144, "frac_reward_zero_std": 1.0, "grad_norm": 4.6142937055545955e-10, "kl": 0.0731201171875, "learning_rate": 3.9280255599188466e-07, "loss": 0.0029, "num_tokens": 2836687885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9198600324315098, "frac_reward_zero_std": 1.0, "grad_norm": 5.591240373205781e-10, "kl": 0.07080078125, "learning_rate": 3.911508266185171e-07, "loss": 0.0028, "num_tokens": 2837252077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9200307245882051, "frac_reward_zero_std": 1.0, "grad_norm": 7.804381471749255e-10, "kl": 0.0699462890625, "learning_rate": 3.895025080397474e-07, "loss": 0.0028, "num_tokens": 2837811133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9202014167449005, "frac_reward_zero_std": 1.0, "grad_norm": 5.484882124445047e-10, "kl": 0.0697021484375, "learning_rate": 3.878576008406687e-07, "loss": 0.0028, "num_tokens": 2838381261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9203721089015959, "frac_reward_zero_std": 1.0, "grad_norm": 5.532189765899037e-10, "kl": 0.0743408203125, "learning_rate": 3.862161056051628e-07, "loss": 0.003, "num_tokens": 2838946877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9205428010582913, "frac_reward_zero_std": 1.0, "grad_norm": 4.0338268555206704e-10, "kl": 0.065673828125, "learning_rate": 3.8457802291590127e-07, "loss": 0.0026, "num_tokens": 2839524909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9207134932149867, "frac_reward_zero_std": 1.0, "grad_norm": 6.208000927303283e-10, "kl": 0.0679931640625, "learning_rate": 3.8294335335434475e-07, "loss": 0.0027, "num_tokens": 2840088461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9208841853716822, "frac_reward_zero_std": 1.0, "grad_norm": 6.441000667973476e-10, "kl": 0.0718994140625, "learning_rate": 3.813120975007434e-07, "loss": 0.0029, "num_tokens": 2840653005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9210548775283776, "frac_reward_zero_std": 1.0, "grad_norm": 5.893008145626583e-10, "kl": 0.071533203125, "learning_rate": 3.796842559341329e-07, "loss": 0.0029, "num_tokens": 2841216029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.921225569685073, "frac_reward_zero_std": 1.0, "grad_norm": 4.66528072491231e-10, "kl": 0.068603515625, "learning_rate": 3.7805982923233674e-07, "loss": 0.0027, "num_tokens": 2841782365.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9213962618417684, "frac_reward_zero_std": 1.0, "grad_norm": 4.2862797090394613e-10, "kl": 0.0726318359375, "learning_rate": 3.7643881797196803e-07, "loss": 0.0029, "num_tokens": 2842354141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9215669539984638, "frac_reward_zero_std": 1.0, "grad_norm": 3.966633594722715e-10, "kl": 0.0687255859375, "learning_rate": 3.748212227284298e-07, "loss": 0.0028, "num_tokens": 2842923181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9217376461551592, "frac_reward_zero_std": 1.0, "grad_norm": 5.407926836204053e-10, "kl": 0.071044921875, "learning_rate": 3.732070440759106e-07, "loss": 0.0028, "num_tokens": 2843488557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9219083383118546, "frac_reward_zero_std": 1.0, "grad_norm": 7.336447559178688e-10, "kl": 0.0804443359375, "learning_rate": 3.715962825873798e-07, "loss": 0.0032, "num_tokens": 2844053181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.92207903046855, "frac_reward_zero_std": 1.0, "grad_norm": 5.908401157907606e-10, "kl": 0.06884765625, "learning_rate": 3.699889388346045e-07, "loss": 0.0028, "num_tokens": 2844616077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9222497226252454, "frac_reward_zero_std": 1.0, "grad_norm": 5.297706278416967e-10, "kl": 0.072998046875, "learning_rate": 3.6838501338813283e-07, "loss": 0.0029, "num_tokens": 2845181053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9224204147819408, "frac_reward_zero_std": 1.0, "grad_norm": 6.051337138651772e-10, "kl": 0.0703125, "learning_rate": 3.667845068173004e-07, "loss": 0.0028, "num_tokens": 2845744045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9225911069386362, "frac_reward_zero_std": 1.0, "grad_norm": 6.236329147737051e-10, "kl": 0.069091796875, "learning_rate": 3.651874196902294e-07, "loss": 0.0028, "num_tokens": 2846306973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9227617990953315, "frac_reward_zero_std": 1.0, "grad_norm": 4.3784607784802786e-10, "kl": 0.0723876953125, "learning_rate": 3.6359375257382643e-07, "loss": 0.0029, "num_tokens": 2846875901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9229324912520269, "frac_reward_zero_std": 1.0, "grad_norm": 2.7906017941030417e-10, "kl": 0.0684814453125, "learning_rate": 3.6200350603378677e-07, "loss": 0.0027, "num_tokens": 2847447565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9231031834087223, "frac_reward_zero_std": 1.0, "grad_norm": 5.80732155056621e-10, "kl": 0.0709228515625, "learning_rate": 3.60416680634591e-07, "loss": 0.0028, "num_tokens": 2848012237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9232738755654177, "frac_reward_zero_std": 1.0, "grad_norm": 4.879313814899044e-10, "kl": 0.0682373046875, "learning_rate": 3.588332769395064e-07, "loss": 0.0027, "num_tokens": 2848587789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9234445677221131, "frac_reward_zero_std": 1.0, "grad_norm": 5.570521180228812e-10, "kl": 0.0701904296875, "learning_rate": 3.5725329551058007e-07, "loss": 0.0028, "num_tokens": 2849152493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9236152598788085, "frac_reward_zero_std": 1.0, "grad_norm": 4.7532843001762e-10, "kl": 0.0697021484375, "learning_rate": 3.5567673690865114e-07, "loss": 0.0028, "num_tokens": 2849720829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.923785952035504, "frac_reward_zero_std": 1.0, "grad_norm": 6.581253026788896e-10, "kl": 0.0712890625, "learning_rate": 3.541036016933419e-07, "loss": 0.0028, "num_tokens": 2850284093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9239566441921994, "frac_reward_zero_std": 1.0, "grad_norm": 6.269737424413455e-10, "kl": 0.0689697265625, "learning_rate": 3.525338904230569e-07, "loss": 0.0028, "num_tokens": 2850849741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9241273363488948, "frac_reward_zero_std": 1.0, "grad_norm": 3.847070567458325e-10, "kl": 0.0679931640625, "learning_rate": 3.509676036549858e-07, "loss": 0.0027, "num_tokens": 2851421133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9242980285055902, "frac_reward_zero_std": 1.0, "grad_norm": 6.904442081163903e-10, "kl": 0.072265625, "learning_rate": 3.494047419451052e-07, "loss": 0.0029, "num_tokens": 2851984093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9244687206622856, "frac_reward_zero_std": 1.0, "grad_norm": 6.526042511657717e-10, "kl": 0.071044921875, "learning_rate": 3.4784530584817676e-07, "loss": 0.0028, "num_tokens": 2852548525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.924639412818981, "frac_reward_zero_std": 1.0, "grad_norm": 3.491926190306085e-10, "kl": 0.065673828125, "learning_rate": 3.462892959177411e-07, "loss": 0.0026, "num_tokens": 2853121453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9248101049756764, "frac_reward_zero_std": 1.0, "grad_norm": 4.698742949466891e-10, "kl": 0.072265625, "learning_rate": 3.447367127061263e-07, "loss": 0.0029, "num_tokens": 2853685405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9249807971323718, "frac_reward_zero_std": 1.0, "grad_norm": 5.615715897550322e-10, "kl": 0.0714111328125, "learning_rate": 3.431875567644427e-07, "loss": 0.0029, "num_tokens": 2854264429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9251514892890672, "frac_reward_zero_std": 1.0, "grad_norm": 6.510091289598645e-10, "kl": 0.069091796875, "learning_rate": 3.416418286425871e-07, "loss": 0.0028, "num_tokens": 2854830413.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9253221814457626, "frac_reward_zero_std": 1.0, "grad_norm": 4.727027973051165e-10, "kl": 0.0682373046875, "learning_rate": 3.4009952888923506e-07, "loss": 0.0027, "num_tokens": 2855399613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.925492873602458, "frac_reward_zero_std": 1.0, "grad_norm": 5.027781024968742e-10, "kl": 0.070556640625, "learning_rate": 3.3856065805184746e-07, "loss": 0.0028, "num_tokens": 2855971325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9256635657591533, "frac_reward_zero_std": 1.0, "grad_norm": 5.416990487330705e-10, "kl": 0.0693359375, "learning_rate": 3.370252166766674e-07, "loss": 0.0028, "num_tokens": 2856539581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9258342579158487, "frac_reward_zero_std": 1.0, "grad_norm": 5.794747237177042e-10, "kl": 0.069580078125, "learning_rate": 3.3549320530872453e-07, "loss": 0.0028, "num_tokens": 2857100685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9260049500725441, "frac_reward_zero_std": 1.0, "grad_norm": 7.88575657826576e-10, "kl": 0.072998046875, "learning_rate": 3.3396462449182264e-07, "loss": 0.0029, "num_tokens": 2857661373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9261756422292395, "frac_reward_zero_std": 1.0, "grad_norm": 4.0144439316879505e-10, "kl": 0.0679931640625, "learning_rate": 3.3243947476855664e-07, "loss": 0.0027, "num_tokens": 2858230765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9263463343859349, "frac_reward_zero_std": 1.0, "grad_norm": 7.7494949627e-10, "kl": 0.0712890625, "learning_rate": 3.309177566802979e-07, "loss": 0.0029, "num_tokens": 2858790893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9265170265426304, "frac_reward_zero_std": 1.0, "grad_norm": 4.005942120846841e-10, "kl": 0.0682373046875, "learning_rate": 3.293994707672021e-07, "loss": 0.0027, "num_tokens": 2859359901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9266877186993258, "frac_reward_zero_std": 1.0, "grad_norm": 4.544250533342736e-10, "kl": 0.0704345703125, "learning_rate": 3.278846175682049e-07, "loss": 0.0028, "num_tokens": 2859925485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9268584108560212, "frac_reward_zero_std": 1.0, "grad_norm": 5.495460704372287e-10, "kl": 0.0709228515625, "learning_rate": 3.2637319762102606e-07, "loss": 0.0028, "num_tokens": 2860489949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9270291030127166, "frac_reward_zero_std": 1.0, "grad_norm": 4.3956404565977333e-10, "kl": 0.066650390625, "learning_rate": 3.248652114621631e-07, "loss": 0.0027, "num_tokens": 2861056381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.927199795169412, "frac_reward_zero_std": 1.0, "grad_norm": 5.9297574003934e-10, "kl": 0.0684814453125, "learning_rate": 3.23360659626899e-07, "loss": 0.0027, "num_tokens": 2861623997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9273704873261074, "frac_reward_zero_std": 1.0, "grad_norm": 6.661428653075395e-10, "kl": 0.072998046875, "learning_rate": 3.218595426492921e-07, "loss": 0.0029, "num_tokens": 2862186189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9275411794828028, "frac_reward_zero_std": 1.0, "grad_norm": 6.512319471246177e-10, "kl": 0.0728759765625, "learning_rate": 3.203618610621884e-07, "loss": 0.0029, "num_tokens": 2862749757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9277118716394982, "frac_reward_zero_std": 1.0, "grad_norm": 5.013456003775801e-10, "kl": 0.0677490234375, "learning_rate": 3.1886761539720813e-07, "loss": 0.0027, "num_tokens": 2863320669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9278825637961936, "frac_reward_zero_std": 1.0, "grad_norm": 6.556535540595207e-10, "kl": 0.0711669921875, "learning_rate": 3.1737680618475706e-07, "loss": 0.0029, "num_tokens": 2863892173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.928053255952889, "frac_reward_zero_std": 1.0, "grad_norm": 4.3862384552040584e-10, "kl": 0.06689453125, "learning_rate": 3.158894339540153e-07, "loss": 0.0027, "num_tokens": 2864456125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9282239481095844, "frac_reward_zero_std": 1.0, "grad_norm": 4.6170644332500774e-10, "kl": 0.0704345703125, "learning_rate": 3.1440549923295036e-07, "loss": 0.0028, "num_tokens": 2865024429.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9283946402662797, "frac_reward_zero_std": 1.0, "grad_norm": 5.494968859106895e-10, "kl": 0.0732421875, "learning_rate": 3.1292500254830106e-07, "loss": 0.0029, "num_tokens": 2865586477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9285653324229751, "frac_reward_zero_std": 1.0, "grad_norm": 7.603855696442615e-10, "kl": 0.0709228515625, "learning_rate": 3.114479444255947e-07, "loss": 0.0028, "num_tokens": 2866149965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9287360245796705, "frac_reward_zero_std": 1.0, "grad_norm": 5.884168734465156e-10, "kl": 0.0716552734375, "learning_rate": 3.099743253891296e-07, "loss": 0.0029, "num_tokens": 2866713453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9289067167363659, "frac_reward_zero_std": 1.0, "grad_norm": 4.96848534201518e-10, "kl": 0.0672607421875, "learning_rate": 3.0850414596198976e-07, "loss": 0.0027, "num_tokens": 2867282317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9290774088930613, "frac_reward_zero_std": 1.0, "grad_norm": 6.847749579625482e-10, "kl": 0.07470703125, "learning_rate": 3.0703740666603534e-07, "loss": 0.003, "num_tokens": 2867849277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9292481010497567, "frac_reward_zero_std": 1.0, "grad_norm": 6.058689364941334e-10, "kl": 0.07080078125, "learning_rate": 3.055741080219066e-07, "loss": 0.0028, "num_tokens": 2868414221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9294187932064522, "frac_reward_zero_std": 1.0, "grad_norm": 5.500822992736391e-10, "kl": 0.07080078125, "learning_rate": 3.041142505490191e-07, "loss": 0.0028, "num_tokens": 2868978125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9295894853631476, "frac_reward_zero_std": 1.0, "grad_norm": 4.0496105546475366e-10, "kl": 0.0675048828125, "learning_rate": 3.0265783476557174e-07, "loss": 0.0027, "num_tokens": 2869546557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.929760177519843, "frac_reward_zero_std": 1.0, "grad_norm": 5.785413264624471e-10, "kl": 0.0697021484375, "learning_rate": 3.012048611885399e-07, "loss": 0.0028, "num_tokens": 2870113021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9299308696765384, "frac_reward_zero_std": 1.0, "grad_norm": 7.429739022701984e-10, "kl": 0.0701904296875, "learning_rate": 2.997553303336753e-07, "loss": 0.0028, "num_tokens": 2870676157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9301015618332338, "frac_reward_zero_std": 1.0, "grad_norm": 3.9342317743975536e-10, "kl": 0.0716552734375, "learning_rate": 2.983092427155099e-07, "loss": 0.0029, "num_tokens": 2871243357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9302722539899292, "frac_reward_zero_std": 1.0, "grad_norm": 4.3117045165704926e-10, "kl": 0.0711669921875, "learning_rate": 2.968665988473518e-07, "loss": 0.0028, "num_tokens": 2871817645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9304429461466246, "frac_reward_zero_std": 1.0, "grad_norm": 5.072641329939039e-10, "kl": 0.0740966796875, "learning_rate": 2.954273992412893e-07, "loss": 0.003, "num_tokens": 2872383933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.93061363830332, "frac_reward_zero_std": 1.0, "grad_norm": 4.322189073897187e-10, "kl": 0.0712890625, "learning_rate": 2.9399164440818474e-07, "loss": 0.0029, "num_tokens": 2872957917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9307843304600154, "frac_reward_zero_std": 1.0, "grad_norm": 4.4129893806437084e-10, "kl": 0.067138671875, "learning_rate": 2.925593348576794e-07, "loss": 0.0027, "num_tokens": 2873524029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9309550226167108, "frac_reward_zero_std": 1.0, "grad_norm": 3.8953854105233606e-10, "kl": 0.0723876953125, "learning_rate": 2.91130471098191e-07, "loss": 0.0029, "num_tokens": 2874092925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9311257147734061, "frac_reward_zero_std": 1.0, "grad_norm": 6.062005000195162e-10, "kl": 0.072021484375, "learning_rate": 2.897050536369161e-07, "loss": 0.0029, "num_tokens": 2874660269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9312964069301015, "frac_reward_zero_std": 1.0, "grad_norm": 5.674947613024435e-10, "kl": 0.068115234375, "learning_rate": 2.8828308297982664e-07, "loss": 0.0027, "num_tokens": 2875225197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9314670990867969, "frac_reward_zero_std": 1.0, "grad_norm": 5.902593933690798e-10, "kl": 0.0684814453125, "learning_rate": 2.868645596316677e-07, "loss": 0.0027, "num_tokens": 2875790125.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9316377912434923, "frac_reward_zero_std": 1.0, "grad_norm": 4.375872844593473e-10, "kl": 0.069091796875, "learning_rate": 2.854494840959665e-07, "loss": 0.0028, "num_tokens": 2876359885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9318084834001877, "frac_reward_zero_std": 1.0, "grad_norm": 7.778354967649849e-10, "kl": 0.07080078125, "learning_rate": 2.840378568750235e-07, "loss": 0.0028, "num_tokens": 2876920493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9319791755568831, "frac_reward_zero_std": 1.0, "grad_norm": 5.52487135945067e-10, "kl": 0.07275390625, "learning_rate": 2.8262967846991787e-07, "loss": 0.0029, "num_tokens": 2877490461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9321498677135786, "frac_reward_zero_std": 1.0, "grad_norm": 4.834458705569574e-10, "kl": 0.0682373046875, "learning_rate": 2.812249493804964e-07, "loss": 0.0027, "num_tokens": 2878057149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.932320559870274, "frac_reward_zero_std": 1.0, "grad_norm": 6.175397310555091e-10, "kl": 0.07177734375, "learning_rate": 2.798236701053902e-07, "loss": 0.0029, "num_tokens": 2878620109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9324912520269694, "frac_reward_zero_std": 1.0, "grad_norm": 6.544997360832173e-10, "kl": 0.070556640625, "learning_rate": 2.784258411420038e-07, "loss": 0.0028, "num_tokens": 2879182989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9326619441836648, "frac_reward_zero_std": 1.0, "grad_norm": 4.532480831848329e-10, "kl": 0.07177734375, "learning_rate": 2.770314629865156e-07, "loss": 0.0029, "num_tokens": 2879750269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9328326363403602, "frac_reward_zero_std": 1.0, "grad_norm": 6.547247104513864e-10, "kl": 0.0750732421875, "learning_rate": 2.7564053613387873e-07, "loss": 0.003, "num_tokens": 2880312061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9330033284970556, "frac_reward_zero_std": 1.0, "grad_norm": 5.881291442692524e-10, "kl": 0.0709228515625, "learning_rate": 2.7425306107782157e-07, "loss": 0.0028, "num_tokens": 2880875533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.933174020653751, "frac_reward_zero_std": 1.0, "grad_norm": 3.895366159030004e-10, "kl": 0.073486328125, "learning_rate": 2.72869038310849e-07, "loss": 0.0029, "num_tokens": 2881446797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9333447128104464, "frac_reward_zero_std": 1.0, "grad_norm": 4.826116561535842e-10, "kl": 0.068603515625, "learning_rate": 2.7148846832423914e-07, "loss": 0.0027, "num_tokens": 2882013997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9335154049671418, "frac_reward_zero_std": 1.0, "grad_norm": 6.770207998135109e-10, "kl": 0.0689697265625, "learning_rate": 2.7011135160804335e-07, "loss": 0.0028, "num_tokens": 2882577181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9336860971238372, "frac_reward_zero_std": 1.0, "grad_norm": 5.123581420539896e-10, "kl": 0.068603515625, "learning_rate": 2.687376886510884e-07, "loss": 0.0027, "num_tokens": 2883142541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9338567892805325, "frac_reward_zero_std": 1.0, "grad_norm": 5.031255977764116e-10, "kl": 0.0723876953125, "learning_rate": 2.6736747994097536e-07, "loss": 0.0029, "num_tokens": 2883705709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9340274814372279, "frac_reward_zero_std": 1.0, "grad_norm": 5.249559617066467e-10, "kl": 0.0687255859375, "learning_rate": 2.660007259640807e-07, "loss": 0.0027, "num_tokens": 2884271149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9341981735939233, "frac_reward_zero_std": 1.0, "grad_norm": 4.206571502926996e-10, "kl": 0.07373046875, "learning_rate": 2.6463742720555076e-07, "loss": 0.003, "num_tokens": 2884837597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9343688657506187, "frac_reward_zero_std": 1.0, "grad_norm": 7.167172900904384e-10, "kl": 0.07177734375, "learning_rate": 2.6327758414930627e-07, "loss": 0.0029, "num_tokens": 2885399741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9345395579073141, "frac_reward_zero_std": 1.0, "grad_norm": 6.335135020861083e-10, "kl": 0.0692138671875, "learning_rate": 2.6192119727804445e-07, "loss": 0.0028, "num_tokens": 2885974589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9347102500640095, "frac_reward_zero_std": 1.0, "grad_norm": 4.2580323806040294e-10, "kl": 0.0689697265625, "learning_rate": 2.6056826707323454e-07, "loss": 0.0028, "num_tokens": 2886541133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.934880942220705, "frac_reward_zero_std": 1.0, "grad_norm": 5.49761046864584e-10, "kl": 0.0740966796875, "learning_rate": 2.5921879401511587e-07, "loss": 0.003, "num_tokens": 2887106925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9350516343774004, "frac_reward_zero_std": 1.0, "grad_norm": 6.58681024749604e-10, "kl": 0.0694580078125, "learning_rate": 2.5787277858270285e-07, "loss": 0.0028, "num_tokens": 2887669629.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9352223265340958, "frac_reward_zero_std": 1.0, "grad_norm": 4.600453202018122e-10, "kl": 0.0728759765625, "learning_rate": 2.5653022125378234e-07, "loss": 0.0029, "num_tokens": 2888235133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9353930186907912, "frac_reward_zero_std": 1.0, "grad_norm": 4.771654096050239e-10, "kl": 0.070556640625, "learning_rate": 2.5519112250491527e-07, "loss": 0.0028, "num_tokens": 2888805405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9355637108474866, "frac_reward_zero_std": 1.0, "grad_norm": 6.420125127402253e-10, "kl": 0.0718994140625, "learning_rate": 2.5385548281143257e-07, "loss": 0.0029, "num_tokens": 2889368349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.935734403004182, "frac_reward_zero_std": 1.0, "grad_norm": 5.105667277797105e-10, "kl": 0.0704345703125, "learning_rate": 2.525233026474361e-07, "loss": 0.0028, "num_tokens": 2889932157.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9359050951608774, "frac_reward_zero_std": 1.0, "grad_norm": 5.873950297930012e-10, "kl": 0.0740966796875, "learning_rate": 2.511945824858042e-07, "loss": 0.003, "num_tokens": 2890497597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9360757873175728, "frac_reward_zero_std": 1.0, "grad_norm": 7.089696902771989e-10, "kl": 0.0701904296875, "learning_rate": 2.498693227981852e-07, "loss": 0.0028, "num_tokens": 2891063869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9362464794742682, "frac_reward_zero_std": 1.0, "grad_norm": 5.013220580033299e-10, "kl": 0.069580078125, "learning_rate": 2.4854752405499505e-07, "loss": 0.0028, "num_tokens": 2891631709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9364171716309636, "frac_reward_zero_std": 1.0, "grad_norm": 5.19600712214014e-10, "kl": 0.070068359375, "learning_rate": 2.472291867254284e-07, "loss": 0.0028, "num_tokens": 2892199661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9365878637876589, "frac_reward_zero_std": 1.0, "grad_norm": 4.787329414716506e-10, "kl": 0.069580078125, "learning_rate": 2.4591431127744424e-07, "loss": 0.0028, "num_tokens": 2892771325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9367585559443543, "frac_reward_zero_std": 1.0, "grad_norm": 5.679071337212705e-10, "kl": 0.072509765625, "learning_rate": 2.446028981777793e-07, "loss": 0.0029, "num_tokens": 2893336477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9369292481010497, "frac_reward_zero_std": 1.0, "grad_norm": 6.810075487559823e-10, "kl": 0.0732421875, "learning_rate": 2.4329494789193443e-07, "loss": 0.0029, "num_tokens": 2893899325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9370999402577451, "frac_reward_zero_std": 1.0, "grad_norm": 5.554083914439877e-10, "kl": 0.0738525390625, "learning_rate": 2.419904608841861e-07, "loss": 0.003, "num_tokens": 2894463437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9372706324144405, "frac_reward_zero_std": 1.0, "grad_norm": 4.099186525007301e-10, "kl": 0.069580078125, "learning_rate": 2.4068943761758055e-07, "loss": 0.0028, "num_tokens": 2895042877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9374413245711359, "frac_reward_zero_std": 1.0, "grad_norm": 3.9989201513004544e-10, "kl": 0.0721435546875, "learning_rate": 2.39391878553934e-07, "loss": 0.0029, "num_tokens": 2895610669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9376120167278313, "frac_reward_zero_std": 1.0, "grad_norm": 6.379655534865359e-10, "kl": 0.06982421875, "learning_rate": 2.3809778415383233e-07, "loss": 0.0028, "num_tokens": 2896180669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9377827088845268, "frac_reward_zero_std": 1.0, "grad_norm": 4.96135532634296e-10, "kl": 0.07275390625, "learning_rate": 2.3680715487663374e-07, "loss": 0.0029, "num_tokens": 2896748141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9379534010412222, "frac_reward_zero_std": 1.0, "grad_norm": 6.454262426356784e-10, "kl": 0.07080078125, "learning_rate": 2.3551999118046286e-07, "loss": 0.0028, "num_tokens": 2897310717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9381240931979176, "frac_reward_zero_std": 1.0, "grad_norm": 7.099375035668197e-10, "kl": 0.0748291015625, "learning_rate": 2.3423629352221867e-07, "loss": 0.003, "num_tokens": 2897872541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.938294785354613, "frac_reward_zero_std": 1.0, "grad_norm": 6.654355866222745e-10, "kl": 0.0753173828125, "learning_rate": 2.3295606235756662e-07, "loss": 0.003, "num_tokens": 2898436173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9384654775113084, "frac_reward_zero_std": 1.0, "grad_norm": 4.09294076061893e-10, "kl": 0.0693359375, "learning_rate": 2.3167929814094214e-07, "loss": 0.0028, "num_tokens": 2899001469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9386361696680038, "frac_reward_zero_std": 1.0, "grad_norm": 5.757981843942513e-10, "kl": 0.0704345703125, "learning_rate": 2.3040600132554937e-07, "loss": 0.0028, "num_tokens": 2899565661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9388068618246992, "frac_reward_zero_std": 1.0, "grad_norm": 4.3084530473542854e-10, "kl": 0.068359375, "learning_rate": 2.291361723633656e-07, "loss": 0.0027, "num_tokens": 2900132765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9389775539813946, "frac_reward_zero_std": 1.0, "grad_norm": 4.747854649435188e-10, "kl": 0.069580078125, "learning_rate": 2.2786981170513255e-07, "loss": 0.0028, "num_tokens": 2900701357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.93914824613809, "frac_reward_zero_std": 1.0, "grad_norm": 5.767826867287614e-10, "kl": 0.0682373046875, "learning_rate": 2.2660691980036176e-07, "loss": 0.0027, "num_tokens": 2901266733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9393189382947853, "frac_reward_zero_std": 1.0, "grad_norm": 4.3814279571063923e-10, "kl": 0.0699462890625, "learning_rate": 2.2534749709733572e-07, "loss": 0.0028, "num_tokens": 2901834381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9394896304514807, "frac_reward_zero_std": 1.0, "grad_norm": 3.6820060421701775e-10, "kl": 0.0714111328125, "learning_rate": 2.240915440431046e-07, "loss": 0.0029, "num_tokens": 2902410733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9396603226081761, "frac_reward_zero_std": 1.0, "grad_norm": 7.517080747877437e-10, "kl": 0.072265625, "learning_rate": 2.2283906108348408e-07, "loss": 0.0029, "num_tokens": 2902973261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9398310147648715, "frac_reward_zero_std": 1.0, "grad_norm": 4.598058885153076e-10, "kl": 0.0692138671875, "learning_rate": 2.2159004866306068e-07, "loss": 0.0028, "num_tokens": 2903547581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9400017069215669, "frac_reward_zero_std": 1.0, "grad_norm": 4.767290877618369e-10, "kl": 0.0721435546875, "learning_rate": 2.2034450722519196e-07, "loss": 0.0029, "num_tokens": 2904116557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9401723990782623, "frac_reward_zero_std": 1.0, "grad_norm": 7.476804353073555e-10, "kl": 0.0689697265625, "learning_rate": 2.1910243721199765e-07, "loss": 0.0028, "num_tokens": 2904678029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9403430912349577, "frac_reward_zero_std": 1.0, "grad_norm": 4.420381231689418e-10, "kl": 0.066650390625, "learning_rate": 2.1786383906436614e-07, "loss": 0.0027, "num_tokens": 2905247021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9405137833916531, "frac_reward_zero_std": 1.0, "grad_norm": 7.096085329013258e-10, "kl": 0.072265625, "learning_rate": 2.16628713221958e-07, "loss": 0.0029, "num_tokens": 2905810493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9406844755483486, "frac_reward_zero_std": 1.0, "grad_norm": 5.833800966227809e-10, "kl": 0.0731201171875, "learning_rate": 2.1539706012319693e-07, "loss": 0.0029, "num_tokens": 2906375741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.940855167705044, "frac_reward_zero_std": 1.0, "grad_norm": 5.601484812265813e-10, "kl": 0.0684814453125, "learning_rate": 2.141688802052766e-07, "loss": 0.0027, "num_tokens": 2906946349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9410258598617394, "frac_reward_zero_std": 1.0, "grad_norm": 5.174461201946537e-10, "kl": 0.0718994140625, "learning_rate": 2.129441739041549e-07, "loss": 0.0029, "num_tokens": 2907512797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9411965520184348, "frac_reward_zero_std": 1.0, "grad_norm": 6.61310982082952e-10, "kl": 0.071044921875, "learning_rate": 2.1172294165455852e-07, "loss": 0.0028, "num_tokens": 2908081869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9413672441751302, "frac_reward_zero_std": 1.0, "grad_norm": 4.4559433464316027e-10, "kl": 0.06982421875, "learning_rate": 2.1050518388998188e-07, "loss": 0.0028, "num_tokens": 2908647565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9415379363318256, "frac_reward_zero_std": 1.0, "grad_norm": 4.3549639627239937e-10, "kl": 0.0743408203125, "learning_rate": 2.092909010426847e-07, "loss": 0.003, "num_tokens": 2909215501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.941708628488521, "frac_reward_zero_std": 1.0, "grad_norm": 5.424373080801858e-10, "kl": 0.069091796875, "learning_rate": 2.0808009354369328e-07, "loss": 0.0028, "num_tokens": 2909786797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9418793206452164, "frac_reward_zero_std": 1.0, "grad_norm": 4.4826501842277935e-10, "kl": 0.0662841796875, "learning_rate": 2.0687276182279948e-07, "loss": 0.0027, "num_tokens": 2910370013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9420500128019118, "frac_reward_zero_std": 1.0, "grad_norm": 5.488286995136889e-10, "kl": 0.0670166015625, "learning_rate": 2.0566890630856594e-07, "loss": 0.0027, "num_tokens": 2910936349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9422207049586071, "frac_reward_zero_std": 1.0, "grad_norm": 6.600382957719817e-10, "kl": 0.07080078125, "learning_rate": 2.0446852742831423e-07, "loss": 0.0028, "num_tokens": 2911500573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9423913971153025, "frac_reward_zero_std": 1.0, "grad_norm": 3.4084752544185704e-10, "kl": 0.0672607421875, "learning_rate": 2.0327162560813685e-07, "loss": 0.0027, "num_tokens": 2912071645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9425620892719979, "frac_reward_zero_std": 1.0, "grad_norm": 4.812664537318326e-10, "kl": 0.066162109375, "learning_rate": 2.0207820127289057e-07, "loss": 0.0026, "num_tokens": 2912644093.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9427327814286933, "frac_reward_zero_std": 1.0, "grad_norm": 5.380957402568959e-10, "kl": 0.0802001953125, "learning_rate": 2.008882548461988e-07, "loss": 0.0032, "num_tokens": 2913209053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9429034735853887, "frac_reward_zero_std": 1.0, "grad_norm": 4.845761770620936e-10, "kl": 0.0712890625, "learning_rate": 1.9970178675044916e-07, "loss": 0.0029, "num_tokens": 2913772701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9430741657420841, "frac_reward_zero_std": 1.0, "grad_norm": 5.759708750262422e-10, "kl": 0.0687255859375, "learning_rate": 1.985187974067937e-07, "loss": 0.0028, "num_tokens": 2914341133.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9432448578987795, "frac_reward_zero_std": 1.0, "grad_norm": 5.546765156157917e-10, "kl": 0.0653076171875, "learning_rate": 1.9733928723515204e-07, "loss": 0.0026, "num_tokens": 2914905405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.943415550055475, "frac_reward_zero_std": 1.0, "grad_norm": 4.911349822225508e-10, "kl": 0.0703125, "learning_rate": 1.9616325665420932e-07, "loss": 0.0028, "num_tokens": 2915472349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9435862422121704, "frac_reward_zero_std": 1.0, "grad_norm": 5.681800157751149e-10, "kl": 0.0740966796875, "learning_rate": 1.9499070608141048e-07, "loss": 0.003, "num_tokens": 2916035709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9437569343688658, "frac_reward_zero_std": 1.0, "grad_norm": 6.492911879416162e-10, "kl": 0.069091796875, "learning_rate": 1.9382163593297143e-07, "loss": 0.0028, "num_tokens": 2916599053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9439276265255612, "frac_reward_zero_std": 1.0, "grad_norm": 7.939428778868559e-10, "kl": 0.0714111328125, "learning_rate": 1.9265604662386804e-07, "loss": 0.0029, "num_tokens": 2917163501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9440983186822566, "frac_reward_zero_std": 1.0, "grad_norm": 6.248427394383139e-10, "kl": 0.070068359375, "learning_rate": 1.9149393856784493e-07, "loss": 0.0028, "num_tokens": 2917726397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.944269010838952, "frac_reward_zero_std": 1.0, "grad_norm": 5.566972674813696e-10, "kl": 0.069580078125, "learning_rate": 1.9033531217740542e-07, "loss": 0.0028, "num_tokens": 2918289069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9444397029956474, "frac_reward_zero_std": 1.0, "grad_norm": 6.32720093422231e-10, "kl": 0.0751953125, "learning_rate": 1.8918016786382276e-07, "loss": 0.003, "num_tokens": 2918850397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9446103951523428, "frac_reward_zero_std": 1.0, "grad_norm": 6.466880491741373e-10, "kl": 0.077392578125, "learning_rate": 1.8802850603712898e-07, "loss": 0.0031, "num_tokens": 2919413837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9447810873090382, "frac_reward_zero_std": 1.0, "grad_norm": 3.149600645150099e-10, "kl": 0.0692138671875, "learning_rate": 1.868803271061248e-07, "loss": 0.0028, "num_tokens": 2919981085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9449517794657335, "frac_reward_zero_std": 1.0, "grad_norm": 6.983813689390908e-10, "kl": 0.0726318359375, "learning_rate": 1.8573563147836983e-07, "loss": 0.0029, "num_tokens": 2920546029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9451224716224289, "frac_reward_zero_std": 1.0, "grad_norm": 6.656889957164358e-10, "kl": 0.0694580078125, "learning_rate": 1.845944195601923e-07, "loss": 0.0028, "num_tokens": 2921110669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9452931637791243, "frac_reward_zero_std": 1.0, "grad_norm": 5.955422160477821e-10, "kl": 0.06982421875, "learning_rate": 1.8345669175667935e-07, "loss": 0.0028, "num_tokens": 2921672253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9454638559358197, "frac_reward_zero_std": 1.0, "grad_norm": 5.125747971026933e-10, "kl": 0.070556640625, "learning_rate": 1.8232244847168456e-07, "loss": 0.0028, "num_tokens": 2922245021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9456345480925151, "frac_reward_zero_std": 1.0, "grad_norm": 5.828117706262332e-10, "kl": 0.069091796875, "learning_rate": 1.8119169010782257e-07, "loss": 0.0028, "num_tokens": 2922811933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9458052402492105, "frac_reward_zero_std": 1.0, "grad_norm": 6.023996206498884e-10, "kl": 0.06982421875, "learning_rate": 1.8006441706647225e-07, "loss": 0.0028, "num_tokens": 2923374829.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9459759324059059, "frac_reward_zero_std": 1.0, "grad_norm": 4.5097051962359905e-10, "kl": 0.072509765625, "learning_rate": 1.789406297477736e-07, "loss": 0.0029, "num_tokens": 2923944205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9461466245626013, "frac_reward_zero_std": 1.0, "grad_norm": 3.464640711975191e-10, "kl": 0.068603515625, "learning_rate": 1.7782032855063302e-07, "loss": 0.0027, "num_tokens": 2924521789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9463173167192968, "frac_reward_zero_std": 1.0, "grad_norm": 4.221374198509327e-10, "kl": 0.0704345703125, "learning_rate": 1.7670351387271356e-07, "loss": 0.0028, "num_tokens": 2925088285.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9464880088759922, "frac_reward_zero_std": 1.0, "grad_norm": 5.601118523724811e-10, "kl": 0.07470703125, "learning_rate": 1.7559018611044697e-07, "loss": 0.003, "num_tokens": 2925653997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9466587010326876, "frac_reward_zero_std": 1.0, "grad_norm": 5.454929358195179e-10, "kl": 0.07421875, "learning_rate": 1.7448034565902384e-07, "loss": 0.003, "num_tokens": 2926220269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.946829393189383, "frac_reward_zero_std": 1.0, "grad_norm": 5.450149665073435e-10, "kl": 0.0687255859375, "learning_rate": 1.7337399291239676e-07, "loss": 0.0028, "num_tokens": 2926789949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9470000853460784, "frac_reward_zero_std": 1.0, "grad_norm": 7.111531244830951e-10, "kl": 0.072265625, "learning_rate": 1.7227112826328162e-07, "loss": 0.0029, "num_tokens": 2927352589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9471707775027738, "frac_reward_zero_std": 1.0, "grad_norm": 6.519474197255826e-10, "kl": 0.0687255859375, "learning_rate": 1.7117175210315418e-07, "loss": 0.0027, "num_tokens": 2927916621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9473414696594692, "frac_reward_zero_std": 1.0, "grad_norm": 6.869071025725552e-10, "kl": 0.0711669921875, "learning_rate": 1.7007586482225557e-07, "loss": 0.0029, "num_tokens": 2928478109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9475121618161646, "frac_reward_zero_std": 1.0, "grad_norm": 8.172993637159771e-10, "kl": 0.0716552734375, "learning_rate": 1.6898346680958577e-07, "loss": 0.0029, "num_tokens": 2929041357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9476828539728599, "frac_reward_zero_std": 1.0, "grad_norm": 6.4199656349593e-10, "kl": 0.0704345703125, "learning_rate": 1.678945584529046e-07, "loss": 0.0028, "num_tokens": 2929608045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9478535461295553, "frac_reward_zero_std": 1.0, "grad_norm": 4.0985461856720354e-10, "kl": 0.07275390625, "learning_rate": 1.6680914013873616e-07, "loss": 0.0029, "num_tokens": 2930177053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9480242382862507, "frac_reward_zero_std": 1.0, "grad_norm": 5.190095607857222e-10, "kl": 0.0694580078125, "learning_rate": 1.6572721225236676e-07, "loss": 0.0028, "num_tokens": 2930744669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9481949304429461, "frac_reward_zero_std": 1.0, "grad_norm": 4.5563733978665206e-10, "kl": 0.0704345703125, "learning_rate": 1.6464877517784027e-07, "loss": 0.0028, "num_tokens": 2931309725.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9483656225996415, "frac_reward_zero_std": 1.0, "grad_norm": 5.015466049619505e-10, "kl": 0.0718994140625, "learning_rate": 1.6357382929796162e-07, "loss": 0.0029, "num_tokens": 2931878765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9485363147563369, "frac_reward_zero_std": 1.0, "grad_norm": 4.4765997891425627e-10, "kl": 0.0711669921875, "learning_rate": 1.6250237499429888e-07, "loss": 0.0028, "num_tokens": 2932447181.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9487070069130323, "frac_reward_zero_std": 1.0, "grad_norm": 3.4253673645943453e-10, "kl": 0.06787109375, "learning_rate": 1.6143441264718008e-07, "loss": 0.0027, "num_tokens": 2933019389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9488776990697277, "frac_reward_zero_std": 1.0, "grad_norm": 4.547351571617645e-10, "kl": 0.068359375, "learning_rate": 1.6036994263569307e-07, "loss": 0.0027, "num_tokens": 2933588877.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9490483912264231, "frac_reward_zero_std": 1.0, "grad_norm": 6.811462819650765e-10, "kl": 0.0709228515625, "learning_rate": 1.5930896533768559e-07, "loss": 0.0028, "num_tokens": 2934154109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9492190833831186, "frac_reward_zero_std": 1.0, "grad_norm": 4.3326602922045433e-10, "kl": 0.07080078125, "learning_rate": 1.5825148112976752e-07, "loss": 0.0028, "num_tokens": 2934721997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.949389775539814, "frac_reward_zero_std": 1.0, "grad_norm": 6.474222712003763e-10, "kl": 0.073486328125, "learning_rate": 1.5719749038730635e-07, "loss": 0.0029, "num_tokens": 2935292077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9495604676965094, "frac_reward_zero_std": 1.0, "grad_norm": 5.563387398742414e-10, "kl": 0.0706787109375, "learning_rate": 1.5614699348443286e-07, "loss": 0.0028, "num_tokens": 2935854029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9497311598532048, "frac_reward_zero_std": 1.0, "grad_norm": 6.528129142020684e-10, "kl": 0.072998046875, "learning_rate": 1.5509999079403314e-07, "loss": 0.0029, "num_tokens": 2936419917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9499018520099002, "frac_reward_zero_std": 1.0, "grad_norm": 7.541290446850408e-10, "kl": 0.0731201171875, "learning_rate": 1.5405648268775554e-07, "loss": 0.0029, "num_tokens": 2936978717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9500725441665956, "frac_reward_zero_std": 1.0, "grad_norm": 5.256311908438798e-10, "kl": 0.0692138671875, "learning_rate": 1.5301646953600812e-07, "loss": 0.0028, "num_tokens": 2937544957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.950243236323291, "frac_reward_zero_std": 1.0, "grad_norm": 2.913573701791277e-10, "kl": 0.0682373046875, "learning_rate": 1.5197995170795897e-07, "loss": 0.0027, "num_tokens": 2938111373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9504139284799863, "frac_reward_zero_std": 1.0, "grad_norm": 9.033521165414068e-10, "kl": 0.0740966796875, "learning_rate": 1.509469295715349e-07, "loss": 0.003, "num_tokens": 2938668397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9505846206366817, "frac_reward_zero_std": 1.0, "grad_norm": 4.359113499386375e-10, "kl": 0.0689697265625, "learning_rate": 1.499174034934192e-07, "loss": 0.0028, "num_tokens": 2939237693.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9507553127933771, "frac_reward_zero_std": 1.0, "grad_norm": 3.6379891969226836e-10, "kl": 0.0665283203125, "learning_rate": 1.4889137383905737e-07, "loss": 0.0027, "num_tokens": 2939808749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9509260049500725, "frac_reward_zero_std": 1.0, "grad_norm": 5.358924159616122e-10, "kl": 0.06884765625, "learning_rate": 1.4786884097265475e-07, "loss": 0.0028, "num_tokens": 2940373405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9510966971067679, "frac_reward_zero_std": 1.0, "grad_norm": 6.173950131807703e-10, "kl": 0.0745849609375, "learning_rate": 1.4684980525717096e-07, "loss": 0.003, "num_tokens": 2940939597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9512673892634633, "frac_reward_zero_std": 1.0, "grad_norm": 4.711987629185803e-10, "kl": 0.0701904296875, "learning_rate": 1.458342670543278e-07, "loss": 0.0028, "num_tokens": 2941506269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9514380814201587, "frac_reward_zero_std": 1.0, "grad_norm": 5.626442803642353e-10, "kl": 0.075439453125, "learning_rate": 1.4482222672460466e-07, "loss": 0.003, "num_tokens": 2942071869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9516087735768541, "frac_reward_zero_std": 1.0, "grad_norm": 5.327666861932812e-10, "kl": 0.0670166015625, "learning_rate": 1.4381368462724088e-07, "loss": 0.0027, "num_tokens": 2942638637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9517794657335495, "frac_reward_zero_std": 1.0, "grad_norm": 4.828134385960171e-10, "kl": 0.06787109375, "learning_rate": 1.4280864112023118e-07, "loss": 0.0027, "num_tokens": 2943205949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.951950157890245, "frac_reward_zero_std": 1.0, "grad_norm": 7.822793050555349e-10, "kl": 0.073486328125, "learning_rate": 1.41807096560328e-07, "loss": 0.0029, "num_tokens": 2943770749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9521208500469404, "frac_reward_zero_std": 1.0, "grad_norm": 5.222464184109882e-10, "kl": 0.0697021484375, "learning_rate": 1.408090513030469e-07, "loss": 0.0028, "num_tokens": 2944339021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9522915422036358, "frac_reward_zero_std": 1.0, "grad_norm": 4.859434604947044e-10, "kl": 0.0670166015625, "learning_rate": 1.3981450570265564e-07, "loss": 0.0027, "num_tokens": 2944908525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9524622343603312, "frac_reward_zero_std": 1.0, "grad_norm": 3.6072518746681286e-10, "kl": 0.0723876953125, "learning_rate": 1.3882346011218295e-07, "loss": 0.0029, "num_tokens": 2945479773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9526329265170266, "frac_reward_zero_std": 1.0, "grad_norm": 3.8793767965994865e-10, "kl": 0.0701904296875, "learning_rate": 1.3783591488341296e-07, "loss": 0.0028, "num_tokens": 2946048301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.952803618673722, "frac_reward_zero_std": 1.0, "grad_norm": 6.054586470322512e-10, "kl": 0.0731201171875, "learning_rate": 1.3685187036688975e-07, "loss": 0.0029, "num_tokens": 2946615245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9529743108304174, "frac_reward_zero_std": 1.0, "grad_norm": 4.6183087707003465e-10, "kl": 0.0714111328125, "learning_rate": 1.3587132691191385e-07, "loss": 0.0029, "num_tokens": 2947181997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9531450029871127, "frac_reward_zero_std": 1.0, "grad_norm": 7.177427333463842e-10, "kl": 0.0721435546875, "learning_rate": 1.3489428486654243e-07, "loss": 0.0029, "num_tokens": 2947747933.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9533156951438081, "frac_reward_zero_std": 1.0, "grad_norm": 5.439287439813316e-10, "kl": 0.07080078125, "learning_rate": 1.339207445775892e-07, "loss": 0.0028, "num_tokens": 2948314749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9534863873005035, "frac_reward_zero_std": 1.0, "grad_norm": 5.748961807958669e-10, "kl": 0.0662841796875, "learning_rate": 1.3295070639062658e-07, "loss": 0.0027, "num_tokens": 2948879901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9536570794571989, "frac_reward_zero_std": 1.0, "grad_norm": 4.707454295899448e-10, "kl": 0.0657958984375, "learning_rate": 1.319841706499847e-07, "loss": 0.0026, "num_tokens": 2949445869.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9538277716138943, "frac_reward_zero_std": 1.0, "grad_norm": 4.2003376871096303e-10, "kl": 0.0677490234375, "learning_rate": 1.3102113769874581e-07, "loss": 0.0027, "num_tokens": 2950013277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9539984637705897, "frac_reward_zero_std": 1.0, "grad_norm": 6.201863884083533e-10, "kl": 0.070068359375, "learning_rate": 1.3006160787875422e-07, "loss": 0.0028, "num_tokens": 2950577789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9541691559272851, "frac_reward_zero_std": 1.0, "grad_norm": 3.8448051583448345e-10, "kl": 0.0689697265625, "learning_rate": 1.2910558153060638e-07, "loss": 0.0028, "num_tokens": 2951145581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9543398480839805, "frac_reward_zero_std": 1.0, "grad_norm": 6.192835838995824e-10, "kl": 0.072021484375, "learning_rate": 1.2815305899365972e-07, "loss": 0.0029, "num_tokens": 2951705421.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9545105402406759, "frac_reward_zero_std": 1.0, "grad_norm": 4.737711220084071e-10, "kl": 0.072021484375, "learning_rate": 1.2720404060602266e-07, "loss": 0.0029, "num_tokens": 2952272493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9546812323973713, "frac_reward_zero_std": 1.0, "grad_norm": 4.055067227677549e-10, "kl": 0.070068359375, "learning_rate": 1.2625852670456462e-07, "loss": 0.0028, "num_tokens": 2952840605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9548519245540668, "frac_reward_zero_std": 1.0, "grad_norm": 6.522942016427625e-10, "kl": 0.0716552734375, "learning_rate": 1.2531651762490716e-07, "loss": 0.0029, "num_tokens": 2953432573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9550226167107622, "frac_reward_zero_std": 1.0, "grad_norm": 7.726477118277179e-10, "kl": 0.0728759765625, "learning_rate": 1.2437801370143166e-07, "loss": 0.0029, "num_tokens": 2953994061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9551933088674576, "frac_reward_zero_std": 1.0, "grad_norm": 4.425855455371447e-10, "kl": 0.0701904296875, "learning_rate": 1.2344301526727053e-07, "loss": 0.0028, "num_tokens": 2954559101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.955364001024153, "frac_reward_zero_std": 1.0, "grad_norm": 5.998289886018982e-10, "kl": 0.0711669921875, "learning_rate": 1.2251152265431608e-07, "loss": 0.0028, "num_tokens": 2955121453.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9555346931808484, "frac_reward_zero_std": 1.0, "grad_norm": 5.327571005199402e-10, "kl": 0.070068359375, "learning_rate": 1.2158353619321382e-07, "loss": 0.0028, "num_tokens": 2955688605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9557053853375438, "frac_reward_zero_std": 1.0, "grad_norm": 5.406412595330243e-10, "kl": 0.0687255859375, "learning_rate": 1.2065905621336692e-07, "loss": 0.0028, "num_tokens": 2956252989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9558760774942391, "frac_reward_zero_std": 1.0, "grad_norm": 7.18157208790643e-10, "kl": 0.0772705078125, "learning_rate": 1.1973808304293067e-07, "loss": 0.0031, "num_tokens": 2956816541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9560467696509345, "frac_reward_zero_std": 1.0, "grad_norm": 4.951680555819651e-10, "kl": 0.0755615234375, "learning_rate": 1.1882061700881908e-07, "loss": 0.003, "num_tokens": 2957384973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9562174618076299, "frac_reward_zero_std": 1.0, "grad_norm": 4.722678437614331e-10, "kl": 0.0709228515625, "learning_rate": 1.1790665843669614e-07, "loss": 0.0028, "num_tokens": 2957953213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9563881539643253, "frac_reward_zero_std": 1.0, "grad_norm": 4.831185371302691e-10, "kl": 0.0694580078125, "learning_rate": 1.1699620765098785e-07, "loss": 0.0028, "num_tokens": 2958522573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9565588461210207, "frac_reward_zero_std": 1.0, "grad_norm": 6.734843224615965e-10, "kl": 0.06787109375, "learning_rate": 1.1608926497487017e-07, "loss": 0.0027, "num_tokens": 2959084397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9567295382777161, "frac_reward_zero_std": 1.0, "grad_norm": 6.543775216612269e-10, "kl": 0.0712890625, "learning_rate": 1.1518583073027334e-07, "loss": 0.0029, "num_tokens": 2959648397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9569002304344115, "frac_reward_zero_std": 1.0, "grad_norm": 4.531152596344068e-10, "kl": 0.06787109375, "learning_rate": 1.142859052378864e-07, "loss": 0.0027, "num_tokens": 2960216045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9570709225911069, "frac_reward_zero_std": 1.0, "grad_norm": 5.056579205593869e-10, "kl": 0.069580078125, "learning_rate": 1.133894888171494e-07, "loss": 0.0028, "num_tokens": 2960782301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9572416147478023, "frac_reward_zero_std": 1.0, "grad_norm": 5.746829684421882e-10, "kl": 0.070068359375, "learning_rate": 1.1249658178625777e-07, "loss": 0.0028, "num_tokens": 2961349389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9574123069044977, "frac_reward_zero_std": 1.0, "grad_norm": 6.06387858997422e-10, "kl": 0.0687255859375, "learning_rate": 1.1160718446216023e-07, "loss": 0.0027, "num_tokens": 2961912781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9575829990611932, "frac_reward_zero_std": 1.0, "grad_norm": 4.616168070297176e-10, "kl": 0.0716552734375, "learning_rate": 1.1072129716056312e-07, "loss": 0.0029, "num_tokens": 2962480141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9577536912178886, "frac_reward_zero_std": 1.0, "grad_norm": 2.3875773205744477e-10, "kl": 0.068115234375, "learning_rate": 1.0983892019592269e-07, "loss": 0.0027, "num_tokens": 2963052941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.957924383374584, "frac_reward_zero_std": 1.0, "grad_norm": 7.961751248544794e-10, "kl": 0.0750732421875, "learning_rate": 1.089600538814506e-07, "loss": 0.003, "num_tokens": 2963613885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9580950755312794, "frac_reward_zero_std": 1.0, "grad_norm": 5.415203518603049e-10, "kl": 0.0677490234375, "learning_rate": 1.0808469852911285e-07, "loss": 0.0027, "num_tokens": 2964183533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9582657676879748, "frac_reward_zero_std": 1.0, "grad_norm": 5.060859716627054e-10, "kl": 0.0703125, "learning_rate": 1.0721285444963092e-07, "loss": 0.0028, "num_tokens": 2964746173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9584364598446702, "frac_reward_zero_std": 1.0, "grad_norm": 5.595861681761573e-10, "kl": 0.070068359375, "learning_rate": 1.0634452195247613e-07, "loss": 0.0028, "num_tokens": 2965310573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9586071520013656, "frac_reward_zero_std": 1.0, "grad_norm": 5.04286478967332e-10, "kl": 0.0718994140625, "learning_rate": 1.0547970134587527e-07, "loss": 0.0029, "num_tokens": 2965881261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9587778441580609, "frac_reward_zero_std": 1.0, "grad_norm": 6.978555997860327e-10, "kl": 0.073486328125, "learning_rate": 1.0461839293680831e-07, "loss": 0.0029, "num_tokens": 2966443645.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9589485363147563, "frac_reward_zero_std": 1.0, "grad_norm": 4.940050977297651e-10, "kl": 0.06640625, "learning_rate": 1.037605970310096e-07, "loss": 0.0027, "num_tokens": 2967013229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9591192284714517, "frac_reward_zero_std": 1.0, "grad_norm": 7.0792766852762e-10, "kl": 0.0711669921875, "learning_rate": 1.0290631393296557e-07, "loss": 0.0028, "num_tokens": 2967575389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9592899206281471, "frac_reward_zero_std": 1.0, "grad_norm": 3.009422345995612e-10, "kl": 0.068115234375, "learning_rate": 1.0205554394591589e-07, "loss": 0.0027, "num_tokens": 2968146029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9594606127848425, "frac_reward_zero_std": 1.0, "grad_norm": 4.792821315749383e-10, "kl": 0.0714111328125, "learning_rate": 1.0120828737185118e-07, "loss": 0.0029, "num_tokens": 2968716381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9596313049415379, "frac_reward_zero_std": 1.0, "grad_norm": 6.09243045937583e-10, "kl": 0.0694580078125, "learning_rate": 1.0036454451151978e-07, "loss": 0.0028, "num_tokens": 2969288509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9598019970982333, "frac_reward_zero_std": 1.0, "grad_norm": 5.362839900458031e-10, "kl": 0.0711669921875, "learning_rate": 9.952431566441877e-08, "loss": 0.0028, "num_tokens": 2969854653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9599726892549287, "frac_reward_zero_std": 1.0, "grad_norm": 5.726707912753584e-10, "kl": 0.0721435546875, "learning_rate": 9.86876011287996e-08, "loss": 0.0029, "num_tokens": 2970422973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9601433814116241, "frac_reward_zero_std": 1.0, "grad_norm": 4.395993451209321e-10, "kl": 0.06982421875, "learning_rate": 9.785440120166356e-08, "loss": 0.0028, "num_tokens": 2971015037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9603140735683195, "frac_reward_zero_std": 1.0, "grad_norm": 5.432496565723088e-10, "kl": 0.068603515625, "learning_rate": 9.702471617876851e-08, "loss": 0.0027, "num_tokens": 2971582685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.960484765725015, "frac_reward_zero_std": 1.0, "grad_norm": 6.241037565132216e-10, "kl": 0.0794677734375, "learning_rate": 9.619854635462445e-08, "loss": 0.0032, "num_tokens": 2972148669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9606554578817104, "frac_reward_zero_std": 1.0, "grad_norm": 6.152341117162265e-10, "kl": 0.0689697265625, "learning_rate": 9.537589202248788e-08, "loss": 0.0028, "num_tokens": 2972712717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9608261500384058, "frac_reward_zero_std": 1.0, "grad_norm": 4.2669863162253156e-10, "kl": 0.0721435546875, "learning_rate": 9.455675347437299e-08, "loss": 0.0029, "num_tokens": 2973284637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9609968421951012, "frac_reward_zero_std": 1.0, "grad_norm": 8.759386456129145e-10, "kl": 0.0726318359375, "learning_rate": 9.374113100104498e-08, "loss": 0.0029, "num_tokens": 2973846893.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9611675343517966, "frac_reward_zero_std": 1.0, "grad_norm": 6.047402842464688e-10, "kl": 0.0682373046875, "learning_rate": 9.292902489202005e-08, "loss": 0.0027, "num_tokens": 2974412253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.961338226508492, "frac_reward_zero_std": 1.0, "grad_norm": 4.776998250517763e-10, "kl": 0.0723876953125, "learning_rate": 9.212043543556759e-08, "loss": 0.0029, "num_tokens": 2974980957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9615089186651873, "frac_reward_zero_std": 1.0, "grad_norm": 5.383759750255694e-10, "kl": 0.070556640625, "learning_rate": 9.13153629187058e-08, "loss": 0.0028, "num_tokens": 2975542845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9616796108218827, "frac_reward_zero_std": 1.0, "grad_norm": 4.517207969232431e-10, "kl": 0.0718994140625, "learning_rate": 9.051380762720607e-08, "loss": 0.0029, "num_tokens": 2976108045.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9618503029785781, "frac_reward_zero_std": 1.0, "grad_norm": 6.121703828328587e-10, "kl": 0.0689697265625, "learning_rate": 8.971576984559416e-08, "loss": 0.0028, "num_tokens": 2976670397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9620209951352735, "frac_reward_zero_std": 1.0, "grad_norm": 5.292358323623247e-10, "kl": 0.072021484375, "learning_rate": 8.892124985714346e-08, "loss": 0.0029, "num_tokens": 2977235773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9621916872919689, "frac_reward_zero_std": 1.0, "grad_norm": 7.802677028067079e-10, "kl": 0.0731201171875, "learning_rate": 8.813024794387947e-08, "loss": 0.0029, "num_tokens": 2977793661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9623623794486643, "frac_reward_zero_std": 1.0, "grad_norm": 4.1660565601118823e-10, "kl": 0.070556640625, "learning_rate": 8.734276438657874e-08, "loss": 0.0028, "num_tokens": 2978362029.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9625330716053597, "frac_reward_zero_std": 1.0, "grad_norm": 8.152641588805205e-10, "kl": 0.0687255859375, "learning_rate": 8.655879946477097e-08, "loss": 0.0027, "num_tokens": 2978920989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9627037637620551, "frac_reward_zero_std": 1.0, "grad_norm": 4.543561475137203e-10, "kl": 0.0692138671875, "learning_rate": 8.577835345673468e-08, "loss": 0.0028, "num_tokens": 2979486493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9628744559187505, "frac_reward_zero_std": 1.0, "grad_norm": 6.127371440915633e-10, "kl": 0.067626953125, "learning_rate": 8.50014266394994e-08, "loss": 0.0027, "num_tokens": 2980050989.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.963045148075446, "frac_reward_zero_std": 1.0, "grad_norm": 6.621059117909574e-10, "kl": 0.0684814453125, "learning_rate": 8.422801928884671e-08, "loss": 0.0027, "num_tokens": 2980613533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9632158402321414, "frac_reward_zero_std": 1.0, "grad_norm": 4.6496081849044093e-10, "kl": 0.069580078125, "learning_rate": 8.345813167930927e-08, "loss": 0.0028, "num_tokens": 2981184685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9633865323888368, "frac_reward_zero_std": 1.0, "grad_norm": 5.103965154541569e-10, "kl": 0.0701904296875, "learning_rate": 8.269176408416846e-08, "loss": 0.0028, "num_tokens": 2981752397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9635572245455322, "frac_reward_zero_std": 1.0, "grad_norm": 5.423323007710762e-10, "kl": 0.069580078125, "learning_rate": 8.192891677545667e-08, "loss": 0.0028, "num_tokens": 2982317373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9637279167022276, "frac_reward_zero_std": 1.0, "grad_norm": 7.274130332328748e-10, "kl": 0.0703125, "learning_rate": 8.11695900239573e-08, "loss": 0.0028, "num_tokens": 2982880653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.963898608858923, "frac_reward_zero_std": 1.0, "grad_norm": 4.6563476850560855e-10, "kl": 0.06787109375, "learning_rate": 8.041378409920476e-08, "loss": 0.0027, "num_tokens": 2983448845.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9640693010156184, "frac_reward_zero_std": 1.0, "grad_norm": 4.712360520229262e-10, "kl": 0.068603515625, "learning_rate": 7.96614992694822e-08, "loss": 0.0027, "num_tokens": 2984029949.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9642399931723137, "frac_reward_zero_std": 1.0, "grad_norm": 4.58012802564722e-10, "kl": 0.0677490234375, "learning_rate": 7.891273580182379e-08, "loss": 0.0027, "num_tokens": 2984593501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9644106853290091, "frac_reward_zero_std": 1.0, "grad_norm": 4.4345882901112667e-10, "kl": 0.0665283203125, "learning_rate": 7.816749396201362e-08, "loss": 0.0027, "num_tokens": 2985165309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9645813774857045, "frac_reward_zero_std": 1.0, "grad_norm": 5.902599504653259e-10, "kl": 0.07275390625, "learning_rate": 7.742577401458562e-08, "loss": 0.0029, "num_tokens": 2985730173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9647520696423999, "frac_reward_zero_std": 1.0, "grad_norm": 2.741307558683669e-10, "kl": 0.0706787109375, "learning_rate": 7.668757622282253e-08, "loss": 0.0028, "num_tokens": 2986301037.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9649227617990953, "frac_reward_zero_std": 1.0, "grad_norm": 5.228540216869186e-10, "kl": 0.0740966796875, "learning_rate": 7.595290084876029e-08, "loss": 0.003, "num_tokens": 2986864381.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9650934539557907, "frac_reward_zero_std": 1.0, "grad_norm": 6.389383736660255e-10, "kl": 0.0728759765625, "learning_rate": 7.522174815318028e-08, "loss": 0.0029, "num_tokens": 2987427917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9652641461124861, "frac_reward_zero_std": 1.0, "grad_norm": 4.783287808185112e-10, "kl": 0.06884765625, "learning_rate": 7.449411839561494e-08, "loss": 0.0028, "num_tokens": 2988009805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9654348382691815, "frac_reward_zero_std": 1.0, "grad_norm": 3.60633217911233e-10, "kl": 0.0689697265625, "learning_rate": 7.377001183434761e-08, "loss": 0.0028, "num_tokens": 2988582189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9656055304258769, "frac_reward_zero_std": 1.0, "grad_norm": 5.508416642579713e-10, "kl": 0.0718994140625, "learning_rate": 7.304942872641052e-08, "loss": 0.0029, "num_tokens": 2989146461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9657762225825723, "frac_reward_zero_std": 1.0, "grad_norm": 4.855646414828629e-10, "kl": 0.0772705078125, "learning_rate": 7.233236932758347e-08, "loss": 0.0031, "num_tokens": 2989716541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9659469147392677, "frac_reward_zero_std": 1.0, "grad_norm": 3.73850787921421e-10, "kl": 0.0709228515625, "learning_rate": 7.161883389239732e-08, "loss": 0.0028, "num_tokens": 2990286781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9661176068959632, "frac_reward_zero_std": 1.0, "grad_norm": 5.121780838379213e-10, "kl": 0.0716552734375, "learning_rate": 7.090882267413058e-08, "loss": 0.0029, "num_tokens": 2990851965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9662882990526586, "frac_reward_zero_std": 1.0, "grad_norm": 4.3608660333736895e-10, "kl": 0.0740966796875, "learning_rate": 7.020233592481273e-08, "loss": 0.003, "num_tokens": 2991416541.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.966458991209354, "frac_reward_zero_std": 1.0, "grad_norm": 6.131727333974378e-10, "kl": 0.0706787109375, "learning_rate": 6.949937389521988e-08, "loss": 0.0028, "num_tokens": 2991982109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9666296833660494, "frac_reward_zero_std": 1.0, "grad_norm": 4.236321814047918e-10, "kl": 0.0701904296875, "learning_rate": 6.879993683488018e-08, "loss": 0.0028, "num_tokens": 2992550637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9668003755227448, "frac_reward_zero_std": 1.0, "grad_norm": 6.633979785208128e-10, "kl": 0.071533203125, "learning_rate": 6.810402499206614e-08, "loss": 0.0029, "num_tokens": 2993114205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9669710676794401, "frac_reward_zero_std": 1.0, "grad_norm": 8.487023379738665e-10, "kl": 0.0712890625, "learning_rate": 6.74116386138024e-08, "loss": 0.0028, "num_tokens": 2993673805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9671417598361355, "frac_reward_zero_std": 1.0, "grad_norm": 7.307363010920985e-10, "kl": 0.0736083984375, "learning_rate": 6.672277794586124e-08, "loss": 0.0029, "num_tokens": 2994237917.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9673124519928309, "frac_reward_zero_std": 1.0, "grad_norm": 7.650764500234176e-10, "kl": 0.072998046875, "learning_rate": 6.603744323276262e-08, "loss": 0.0029, "num_tokens": 2994803773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9674831441495263, "frac_reward_zero_std": 1.0, "grad_norm": 4.52116799480178e-10, "kl": 0.0633544921875, "learning_rate": 6.53556347177764e-08, "loss": 0.0025, "num_tokens": 2995367997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9676538363062217, "frac_reward_zero_std": 1.0, "grad_norm": 6.813571731247012e-10, "kl": 0.0740966796875, "learning_rate": 6.467735264292008e-08, "loss": 0.003, "num_tokens": 2995931405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9678245284629171, "frac_reward_zero_std": 1.0, "grad_norm": 7.761667019798323e-10, "kl": 0.0704345703125, "learning_rate": 6.400259724895996e-08, "loss": 0.0028, "num_tokens": 2996490061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9679952206196125, "frac_reward_zero_std": 1.0, "grad_norm": 6.224135017016304e-10, "kl": 0.0704345703125, "learning_rate": 6.33313687754078e-08, "loss": 0.0028, "num_tokens": 2997059117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9681659127763079, "frac_reward_zero_std": 1.0, "grad_norm": 5.283920881164444e-10, "kl": 0.0714111328125, "learning_rate": 6.266366746052633e-08, "loss": 0.0029, "num_tokens": 2997624445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9683366049330033, "frac_reward_zero_std": 1.0, "grad_norm": 6.355246629530776e-10, "kl": 0.0675048828125, "learning_rate": 6.199949354132595e-08, "loss": 0.0027, "num_tokens": 2998190749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9685072970896987, "frac_reward_zero_std": 1.0, "grad_norm": 5.392386893984815e-10, "kl": 0.0682373046875, "learning_rate": 6.133884725356476e-08, "loss": 0.0027, "num_tokens": 2998755325.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9686779892463941, "frac_reward_zero_std": 1.0, "grad_norm": 7.605333439596061e-10, "kl": 0.078125, "learning_rate": 6.068172883174738e-08, "loss": 0.0031, "num_tokens": 2999321261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9688486814030896, "frac_reward_zero_std": 1.0, "grad_norm": 3.230230102442239e-10, "kl": 0.0689697265625, "learning_rate": 6.002813850912726e-08, "loss": 0.0028, "num_tokens": 2999892109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.969019373559785, "frac_reward_zero_std": 1.0, "grad_norm": 5.526917619940177e-10, "kl": 0.0767822265625, "learning_rate": 5.937807651770544e-08, "loss": 0.0031, "num_tokens": 3000454701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9691900657164804, "frac_reward_zero_std": 1.0, "grad_norm": 4.325891608791047e-10, "kl": 0.0699462890625, "learning_rate": 5.87315430882307e-08, "loss": 0.0028, "num_tokens": 3001022653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9693607578731758, "frac_reward_zero_std": 1.0, "grad_norm": 3.807181278001569e-10, "kl": 0.0684814453125, "learning_rate": 5.8088538450199464e-08, "loss": 0.0027, "num_tokens": 3001591165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9695314500298712, "frac_reward_zero_std": 1.0, "grad_norm": 4.622890065417867e-10, "kl": 0.0682373046875, "learning_rate": 5.744906283185469e-08, "loss": 0.0027, "num_tokens": 3002159581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9697021421865665, "frac_reward_zero_std": 1.0, "grad_norm": 3.546763082355208e-10, "kl": 0.076416015625, "learning_rate": 5.6813116460185944e-08, "loss": 0.003, "num_tokens": 3002729005.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9698728343432619, "frac_reward_zero_std": 1.0, "grad_norm": 5.272812131278525e-10, "kl": 0.0682373046875, "learning_rate": 5.618069956093375e-08, "loss": 0.0027, "num_tokens": 3003294749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9700435264999573, "frac_reward_zero_std": 1.0, "grad_norm": 4.860232432939845e-10, "kl": 0.07080078125, "learning_rate": 5.555181235858187e-08, "loss": 0.0028, "num_tokens": 3003868637.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9702142186566527, "frac_reward_zero_std": 1.0, "grad_norm": 5.095842851461395e-10, "kl": 0.068603515625, "learning_rate": 5.492645507636174e-08, "loss": 0.0027, "num_tokens": 3004437677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9703849108133481, "frac_reward_zero_std": 1.0, "grad_norm": 8.613215224273891e-10, "kl": 0.0716552734375, "learning_rate": 5.4304627936254685e-08, "loss": 0.0029, "num_tokens": 3004997965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9705556029700435, "frac_reward_zero_std": 1.0, "grad_norm": 4.965278059345131e-10, "kl": 0.070068359375, "learning_rate": 5.368633115898414e-08, "loss": 0.0028, "num_tokens": 3005567997.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9707262951267389, "frac_reward_zero_std": 1.0, "grad_norm": 5.278992977356225e-10, "kl": 0.075927734375, "learning_rate": 5.3071564964026766e-08, "loss": 0.003, "num_tokens": 3006135741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9708969872834343, "frac_reward_zero_std": 1.0, "grad_norm": 5.208385264317316e-10, "kl": 0.0673828125, "learning_rate": 5.246032956959912e-08, "loss": 0.0027, "num_tokens": 3006702333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9710676794401297, "frac_reward_zero_std": 1.0, "grad_norm": 4.851588711101755e-10, "kl": 0.0706787109375, "learning_rate": 5.185262519266876e-08, "loss": 0.0028, "num_tokens": 3007269197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9712383715968251, "frac_reward_zero_std": 1.0, "grad_norm": 4.806182325144e-10, "kl": 0.070068359375, "learning_rate": 5.124845204894868e-08, "loss": 0.0028, "num_tokens": 3007838269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9714090637535205, "frac_reward_zero_std": 1.0, "grad_norm": 5.350051753468484e-10, "kl": 0.065673828125, "learning_rate": 5.0647810352899565e-08, "loss": 0.0026, "num_tokens": 3008404237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.971579755910216, "frac_reward_zero_std": 1.0, "grad_norm": 3.3964955363647563e-10, "kl": 0.0723876953125, "learning_rate": 5.005070031772752e-08, "loss": 0.0029, "num_tokens": 3008970749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9717504480669114, "frac_reward_zero_std": 1.0, "grad_norm": 5.509217283054181e-10, "kl": 0.076171875, "learning_rate": 4.945712215538301e-08, "loss": 0.003, "num_tokens": 3009537405.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9719211402236068, "frac_reward_zero_std": 1.0, "grad_norm": 6.696841165105091e-10, "kl": 0.0738525390625, "learning_rate": 4.886707607656638e-08, "loss": 0.003, "num_tokens": 3010103229.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9720918323803022, "frac_reward_zero_std": 1.0, "grad_norm": 4.158426822147494e-10, "kl": 0.0693359375, "learning_rate": 4.828056229072231e-08, "loss": 0.0028, "num_tokens": 3010670909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9722625245369976, "frac_reward_zero_std": 1.0, "grad_norm": 4.738494493132031e-10, "kl": 0.0667724609375, "learning_rate": 4.769758100604316e-08, "loss": 0.0027, "num_tokens": 3011239789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9724332166936929, "frac_reward_zero_std": 1.0, "grad_norm": 4.953788847947812e-10, "kl": 0.0704345703125, "learning_rate": 4.7118132429464506e-08, "loss": 0.0028, "num_tokens": 3011806349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9726039088503883, "frac_reward_zero_std": 1.0, "grad_norm": 3.9229428375746254e-10, "kl": 0.0653076171875, "learning_rate": 4.654221676666959e-08, "loss": 0.0026, "num_tokens": 3012382253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9727746010070837, "frac_reward_zero_std": 1.0, "grad_norm": 4.352517955595786e-10, "kl": 0.068603515625, "learning_rate": 4.596983422209045e-08, "loss": 0.0027, "num_tokens": 3012948333.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9729452931637791, "frac_reward_zero_std": 1.0, "grad_norm": 5.484187673558298e-10, "kl": 0.071044921875, "learning_rate": 4.5400984998899e-08, "loss": 0.0028, "num_tokens": 3013514253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9731159853204745, "frac_reward_zero_std": 1.0, "grad_norm": 5.611526012344213e-10, "kl": 0.07080078125, "learning_rate": 4.483566929901817e-08, "loss": 0.0028, "num_tokens": 3014082061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9732866774771699, "frac_reward_zero_std": 1.0, "grad_norm": 6.211333790854561e-10, "kl": 0.06884765625, "learning_rate": 4.4273887323113e-08, "loss": 0.0028, "num_tokens": 3014669373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9734573696338653, "frac_reward_zero_std": 1.0, "grad_norm": 3.868876683433404e-10, "kl": 0.0736083984375, "learning_rate": 4.37156392705973e-08, "loss": 0.0029, "num_tokens": 3015248765.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9736280617905607, "frac_reward_zero_std": 1.0, "grad_norm": 7.082953744358992e-10, "kl": 0.0748291015625, "learning_rate": 4.3160925339629233e-08, "loss": 0.003, "num_tokens": 3015812109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9737987539472561, "frac_reward_zero_std": 1.0, "grad_norm": 4.958488639232896e-10, "kl": 0.072509765625, "learning_rate": 4.2609745727110185e-08, "loss": 0.0029, "num_tokens": 3016378973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9739694461039515, "frac_reward_zero_std": 1.0, "grad_norm": 4.257643423517481e-10, "kl": 0.0675048828125, "learning_rate": 4.2062100628691427e-08, "loss": 0.0027, "num_tokens": 3016947661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9741401382606469, "frac_reward_zero_std": 1.0, "grad_norm": 3.8279840881757753e-10, "kl": 0.0677490234375, "learning_rate": 4.151799023876524e-08, "loss": 0.0027, "num_tokens": 3017522733.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9743108304173423, "frac_reward_zero_std": 1.0, "grad_norm": 6.517598178706768e-10, "kl": 0.070068359375, "learning_rate": 4.0977414750471565e-08, "loss": 0.0028, "num_tokens": 3018084493.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9744815225740378, "frac_reward_zero_std": 1.0, "grad_norm": 4.958846534959708e-10, "kl": 0.069580078125, "learning_rate": 4.044037435569692e-08, "loss": 0.0028, "num_tokens": 3018650237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9746522147307332, "frac_reward_zero_std": 1.0, "grad_norm": 6.395709815132609e-10, "kl": 0.07275390625, "learning_rate": 3.9906869245068816e-08, "loss": 0.0029, "num_tokens": 3019215709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9748229068874286, "frac_reward_zero_std": 1.0, "grad_norm": 6.4268408493106e-10, "kl": 0.0701904296875, "learning_rate": 3.9376899607963536e-08, "loss": 0.0028, "num_tokens": 3019779309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.974993599044124, "frac_reward_zero_std": 1.0, "grad_norm": 5.26128561333123e-10, "kl": 0.067626953125, "learning_rate": 3.885046563250061e-08, "loss": 0.0027, "num_tokens": 3020345789.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9751642912008194, "frac_reward_zero_std": 1.0, "grad_norm": 5.82772105260671e-10, "kl": 0.07421875, "learning_rate": 3.832756750554722e-08, "loss": 0.003, "num_tokens": 3020920621.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9753349833575147, "frac_reward_zero_std": 1.0, "grad_norm": 5.144347539537785e-10, "kl": 0.0699462890625, "learning_rate": 3.7808205412710463e-08, "loss": 0.0028, "num_tokens": 3021489085.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9755056755142101, "frac_reward_zero_std": 1.0, "grad_norm": 5.252885373752425e-10, "kl": 0.0704345703125, "learning_rate": 3.729237953834619e-08, "loss": 0.0028, "num_tokens": 3022054237.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9756763676709055, "frac_reward_zero_std": 1.0, "grad_norm": 6.282499085363627e-10, "kl": 0.067626953125, "learning_rate": 3.678009006555461e-08, "loss": 0.0027, "num_tokens": 3022619245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9758470598276009, "frac_reward_zero_std": 1.0, "grad_norm": 4.108436512403922e-10, "kl": 0.0687255859375, "learning_rate": 3.6271337176179146e-08, "loss": 0.0028, "num_tokens": 3023190221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9760177519842963, "frac_reward_zero_std": 1.0, "grad_norm": 4.4336614340232027e-10, "kl": 0.068359375, "learning_rate": 3.5766121050808675e-08, "loss": 0.0027, "num_tokens": 3023757469.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9761884441409917, "frac_reward_zero_std": 1.0, "grad_norm": 4.4522189615428883e-10, "kl": 0.07080078125, "learning_rate": 3.526444186877753e-08, "loss": 0.0028, "num_tokens": 3024324525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9763591362976871, "frac_reward_zero_std": 1.0, "grad_norm": 6.514158459104742e-10, "kl": 0.0704345703125, "learning_rate": 3.476629980816326e-08, "loss": 0.0028, "num_tokens": 3024886605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9765298284543825, "frac_reward_zero_std": 1.0, "grad_norm": 6.816456297464534e-10, "kl": 0.0709228515625, "learning_rate": 3.427169504578776e-08, "loss": 0.0028, "num_tokens": 3025447101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9767005206110779, "frac_reward_zero_std": 1.0, "grad_norm": 5.668476064812718e-10, "kl": 0.068603515625, "learning_rate": 3.378062775721946e-08, "loss": 0.0027, "num_tokens": 3026016141.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9768712127677733, "frac_reward_zero_std": 1.0, "grad_norm": 5.451480579422411e-10, "kl": 0.0704345703125, "learning_rate": 3.329309811676784e-08, "loss": 0.0028, "num_tokens": 3026583613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9770419049244687, "frac_reward_zero_std": 1.0, "grad_norm": 6.636483088659375e-10, "kl": 0.0723876953125, "learning_rate": 3.28091062974889e-08, "loss": 0.0029, "num_tokens": 3027151485.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9772125970811641, "frac_reward_zero_std": 1.0, "grad_norm": 5.825978061611219e-10, "kl": 0.073974609375, "learning_rate": 3.232865247118189e-08, "loss": 0.003, "num_tokens": 3027715613.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9773832892378596, "frac_reward_zero_std": 1.0, "grad_norm": 5.464297289146152e-10, "kl": 0.07373046875, "learning_rate": 3.1851736808391484e-08, "loss": 0.003, "num_tokens": 3028283149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.977553981394555, "frac_reward_zero_std": 1.0, "grad_norm": 4.812951114736291e-10, "kl": 0.0699462890625, "learning_rate": 3.137835947840451e-08, "loss": 0.0028, "num_tokens": 3028849021.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9777246735512504, "frac_reward_zero_std": 1.0, "grad_norm": 4.761374301621112e-10, "kl": 0.0682373046875, "learning_rate": 3.0908520649254316e-08, "loss": 0.0027, "num_tokens": 3029415597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9778953657079458, "frac_reward_zero_std": 1.0, "grad_norm": 9.432518540812432e-10, "kl": 0.0731201171875, "learning_rate": 3.044222048771417e-08, "loss": 0.0029, "num_tokens": 3029974669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9780660578646411, "frac_reward_zero_std": 1.0, "grad_norm": 5.781684025981895e-10, "kl": 0.066650390625, "learning_rate": 2.997945915930722e-08, "loss": 0.0027, "num_tokens": 3030539389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9782367500213365, "frac_reward_zero_std": 1.0, "grad_norm": 0.009668312794421926, "kl": 0.0924072265625, "learning_rate": 2.9520236828294303e-08, "loss": 0.0037, "num_tokens": 3031131773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9784074421780319, "frac_reward_zero_std": 1.0, "grad_norm": 4.780999701297398e-10, "kl": 0.0694580078125, "learning_rate": 2.9064553657683902e-08, "loss": 0.0028, "num_tokens": 3031705069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9785781343347273, "frac_reward_zero_std": 1.0, "grad_norm": 5.269286235021274e-10, "kl": 0.069580078125, "learning_rate": 2.8612409809226637e-08, "loss": 0.0028, "num_tokens": 3032270669.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9787488264914227, "frac_reward_zero_std": 1.0, "grad_norm": 5.479861716563544e-10, "kl": 0.0679931640625, "learning_rate": 2.8163805443417457e-08, "loss": 0.0027, "num_tokens": 3032834685.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9789195186481181, "frac_reward_zero_std": 1.0, "grad_norm": 4.872547630721973e-10, "kl": 0.0711669921875, "learning_rate": 2.7718740719494543e-08, "loss": 0.0028, "num_tokens": 3033414925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9790902108048135, "frac_reward_zero_std": 1.0, "grad_norm": 5.880135201953789e-10, "kl": 0.071044921875, "learning_rate": 2.7277215795440404e-08, "loss": 0.0028, "num_tokens": 3033975965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9792609029615089, "frac_reward_zero_std": 1.0, "grad_norm": 7.561729365427357e-10, "kl": 0.0718994140625, "learning_rate": 2.683923082797968e-08, "loss": 0.0029, "num_tokens": 3034536925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9794315951182043, "frac_reward_zero_std": 1.0, "grad_norm": 4.4714895178777696e-10, "kl": 0.07275390625, "learning_rate": 2.6404785972581337e-08, "loss": 0.0029, "num_tokens": 3035103661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9796022872748997, "frac_reward_zero_std": 1.0, "grad_norm": 3.246075136874536e-10, "kl": 0.06689453125, "learning_rate": 2.5973881383458687e-08, "loss": 0.0027, "num_tokens": 3035678189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9797729794315951, "frac_reward_zero_std": 1.0, "grad_norm": 5.294330223989677e-10, "kl": 0.066650390625, "learning_rate": 2.5546517213566045e-08, "loss": 0.0027, "num_tokens": 3036245101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9799436715882905, "frac_reward_zero_std": 1.0, "grad_norm": 4.97944578641556e-10, "kl": 0.0706787109375, "learning_rate": 2.5122693614602068e-08, "loss": 0.0028, "num_tokens": 3036811165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.980114363744986, "frac_reward_zero_std": 1.0, "grad_norm": 4.1824178340169477e-10, "kl": 0.068115234375, "learning_rate": 2.4702410737010852e-08, "loss": 0.0027, "num_tokens": 3037378557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9802850559016814, "frac_reward_zero_std": 1.0, "grad_norm": 5.692868819708181e-10, "kl": 0.0699462890625, "learning_rate": 2.4285668729975287e-08, "loss": 0.0028, "num_tokens": 3037945965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9804557480583768, "frac_reward_zero_std": 1.0, "grad_norm": 6.368701335249955e-10, "kl": 0.0733642578125, "learning_rate": 2.387246774142482e-08, "loss": 0.0029, "num_tokens": 3038509341.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9806264402150722, "frac_reward_zero_std": 1.0, "grad_norm": 5.948544581137628e-10, "kl": 0.0699462890625, "learning_rate": 2.3462807918031015e-08, "loss": 0.0028, "num_tokens": 3039074445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9807971323717675, "frac_reward_zero_std": 1.0, "grad_norm": 6.84207008706564e-10, "kl": 0.0748291015625, "learning_rate": 2.3056689405207555e-08, "loss": 0.003, "num_tokens": 3039638349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9809678245284629, "frac_reward_zero_std": 1.0, "grad_norm": 7.299067453154242e-10, "kl": 0.0718994140625, "learning_rate": 2.2654112347113567e-08, "loss": 0.0029, "num_tokens": 3040205677.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9811385166851583, "frac_reward_zero_std": 1.0, "grad_norm": 7.134822208902136e-10, "kl": 0.0693359375, "learning_rate": 2.2255076886646965e-08, "loss": 0.0028, "num_tokens": 3040767101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9813092088418537, "frac_reward_zero_std": 1.0, "grad_norm": 5.482636096383058e-10, "kl": 0.0731201171875, "learning_rate": 2.1859583165453336e-08, "loss": 0.0029, "num_tokens": 3041337245.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9814799009985491, "frac_reward_zero_std": 1.0, "grad_norm": 5.753293679239256e-10, "kl": 0.0697021484375, "learning_rate": 2.146763132391594e-08, "loss": 0.0028, "num_tokens": 3041908573.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9816505931552445, "frac_reward_zero_std": 1.0, "grad_norm": 5.547981138515447e-10, "kl": 0.073974609375, "learning_rate": 2.1079221501166814e-08, "loss": 0.003, "num_tokens": 3042475805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9818212853119399, "frac_reward_zero_std": 1.0, "grad_norm": 6.960089598354633e-10, "kl": 0.074462890625, "learning_rate": 2.0694353835074566e-08, "loss": 0.003, "num_tokens": 3043036781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9819919774686353, "frac_reward_zero_std": 1.0, "grad_norm": 4.953351134843714e-10, "kl": 0.06982421875, "learning_rate": 2.0313028462254358e-08, "loss": 0.0028, "num_tokens": 3043605261.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9821626696253307, "frac_reward_zero_std": 1.0, "grad_norm": 7.399855940680616e-10, "kl": 0.0667724609375, "learning_rate": 1.9935245518063474e-08, "loss": 0.0027, "num_tokens": 3044167069.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9823333617820261, "frac_reward_zero_std": 1.0, "grad_norm": 5.921845037139128e-10, "kl": 0.06689453125, "learning_rate": 1.9561005136601308e-08, "loss": 0.0027, "num_tokens": 3044730509.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9825040539387215, "frac_reward_zero_std": 1.0, "grad_norm": 4.3835631238789975e-10, "kl": 0.0670166015625, "learning_rate": 1.9190307450708266e-08, "loss": 0.0027, "num_tokens": 3045297533.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9826747460954169, "frac_reward_zero_std": 1.0, "grad_norm": 5.211454530269488e-10, "kl": 0.069091796875, "learning_rate": 1.8823152591970205e-08, "loss": 0.0028, "num_tokens": 3045862477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9828454382521123, "frac_reward_zero_std": 1.0, "grad_norm": 6.262551464763707e-10, "kl": 0.0711669921875, "learning_rate": 1.8459540690712875e-08, "loss": 0.0028, "num_tokens": 3046429373.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9830161304088078, "frac_reward_zero_std": 1.0, "grad_norm": 6.753535520340682e-10, "kl": 0.071533203125, "learning_rate": 1.809947187600525e-08, "loss": 0.0029, "num_tokens": 3046988973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9831868225655032, "frac_reward_zero_std": 1.0, "grad_norm": 5.838109593121494e-10, "kl": 0.0714111328125, "learning_rate": 1.7742946275659535e-08, "loss": 0.0029, "num_tokens": 3047551901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9833575147221986, "frac_reward_zero_std": 1.0, "grad_norm": 6.223257330890978e-10, "kl": 0.072998046875, "learning_rate": 1.7389964016228943e-08, "loss": 0.0029, "num_tokens": 3048118477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9835282068788939, "frac_reward_zero_std": 1.0, "grad_norm": 3.009345733607128e-10, "kl": 0.0704345703125, "learning_rate": 1.704052522300992e-08, "loss": 0.0028, "num_tokens": 3048688253.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9836988990355893, "frac_reward_zero_std": 1.0, "grad_norm": 7.283457900751453e-10, "kl": 0.0745849609375, "learning_rate": 1.6694630020041014e-08, "loss": 0.003, "num_tokens": 3049251293.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9838695911922847, "frac_reward_zero_std": 1.0, "grad_norm": 5.587568184977195e-10, "kl": 0.0718994140625, "learning_rate": 1.6352278530100683e-08, "loss": 0.0029, "num_tokens": 3049816349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9840402833489801, "frac_reward_zero_std": 1.0, "grad_norm": 6.159592944706215e-10, "kl": 0.07080078125, "learning_rate": 1.6013470874712833e-08, "loss": 0.0028, "num_tokens": 3050379773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9842109755056755, "frac_reward_zero_std": 1.0, "grad_norm": 4.064164011664089e-10, "kl": 0.069091796875, "learning_rate": 1.567820717414126e-08, "loss": 0.0028, "num_tokens": 3050952925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9843816676623709, "frac_reward_zero_std": 1.0, "grad_norm": 4.890091675555032e-10, "kl": 0.0753173828125, "learning_rate": 1.5346487547392984e-08, "loss": 0.003, "num_tokens": 3051519965.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9845523598190663, "frac_reward_zero_std": 1.0, "grad_norm": 4.6221541854538666e-10, "kl": 0.0709228515625, "learning_rate": 1.501831211221716e-08, "loss": 0.0028, "num_tokens": 3052093149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9847230519757617, "frac_reward_zero_std": 1.0, "grad_norm": 3.543667007337604e-10, "kl": 0.06689453125, "learning_rate": 1.4693680985102821e-08, "loss": 0.0027, "num_tokens": 3052663773.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9848937441324571, "frac_reward_zero_std": 1.0, "grad_norm": 4.691750668202472e-10, "kl": 0.0726318359375, "learning_rate": 1.4372594281282237e-08, "loss": 0.0029, "num_tokens": 3053230445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9850644362891525, "frac_reward_zero_std": 1.0, "grad_norm": 4.863179885602617e-10, "kl": 0.0699462890625, "learning_rate": 1.4055052114730905e-08, "loss": 0.0028, "num_tokens": 3053799213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9852351284458479, "frac_reward_zero_std": 1.0, "grad_norm": 6.454723566889499e-10, "kl": 0.0701904296875, "learning_rate": 1.3741054598164216e-08, "loss": 0.0028, "num_tokens": 3054365165.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9854058206025433, "frac_reward_zero_std": 1.0, "grad_norm": 3.98754942945531e-10, "kl": 0.0732421875, "learning_rate": 1.3430601843039681e-08, "loss": 0.0029, "num_tokens": 3054932445.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9855765127592387, "frac_reward_zero_std": 1.0, "grad_norm": 6.105145419984318e-10, "kl": 0.0706787109375, "learning_rate": 1.3123693959556927e-08, "loss": 0.0028, "num_tokens": 3055494461.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9857472049159341, "frac_reward_zero_std": 1.0, "grad_norm": 4.0865928924817296e-10, "kl": 0.070556640625, "learning_rate": 1.2820331056657698e-08, "loss": 0.0028, "num_tokens": 3056062077.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9859178970726296, "frac_reward_zero_std": 1.0, "grad_norm": 7.224785689275984e-10, "kl": 0.0706787109375, "learning_rate": 1.2520513242023636e-08, "loss": 0.0028, "num_tokens": 3056625901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.986088589229325, "frac_reward_zero_std": 1.0, "grad_norm": 3.9706887039851487e-10, "kl": 0.0670166015625, "learning_rate": 1.222424062208072e-08, "loss": 0.0027, "num_tokens": 3057194909.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9862592813860203, "frac_reward_zero_std": 1.0, "grad_norm": 5.548515877554615e-10, "kl": 0.0689697265625, "learning_rate": 1.1931513301994823e-08, "loss": 0.0028, "num_tokens": 3057762781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9864299735427157, "frac_reward_zero_std": 1.0, "grad_norm": 6.10576126878506e-10, "kl": 0.0679931640625, "learning_rate": 1.1642331385672834e-08, "loss": 0.0027, "num_tokens": 3058326589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9866006656994111, "frac_reward_zero_std": 1.0, "grad_norm": 5.52494068640634e-10, "kl": 0.069091796875, "learning_rate": 1.135669497576375e-08, "loss": 0.0028, "num_tokens": 3058892781.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9867713578561065, "frac_reward_zero_std": 1.0, "grad_norm": 4.72800195400435e-10, "kl": 0.0731201171875, "learning_rate": 1.1074604173658688e-08, "loss": 0.0029, "num_tokens": 3059461277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9869420500128019, "frac_reward_zero_std": 1.0, "grad_norm": 6.07163917786634e-10, "kl": 0.0711669921875, "learning_rate": 1.0796059079489774e-08, "loss": 0.0028, "num_tokens": 3060024941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9871127421694973, "frac_reward_zero_std": 1.0, "grad_norm": 6.052747068788022e-10, "kl": 0.073486328125, "learning_rate": 1.0521059792130139e-08, "loss": 0.0029, "num_tokens": 3060590797.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9872834343261927, "frac_reward_zero_std": 1.0, "grad_norm": 3.9191879735185655e-10, "kl": 0.0689697265625, "learning_rate": 1.0249606409195035e-08, "loss": 0.0028, "num_tokens": 3061157581.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9874541264828881, "frac_reward_zero_std": 1.0, "grad_norm": 5.487112407705453e-10, "kl": 0.069091796875, "learning_rate": 9.981699027040714e-09, "loss": 0.0028, "num_tokens": 3061723357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9876248186395835, "frac_reward_zero_std": 1.0, "grad_norm": 3.436142314519597e-10, "kl": 0.06640625, "learning_rate": 9.717337740763334e-09, "loss": 0.0026, "num_tokens": 3062294941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9877955107962789, "frac_reward_zero_std": 1.0, "grad_norm": 7.25245143175018e-10, "kl": 0.0672607421875, "learning_rate": 9.45652264420338e-09, "loss": 0.0027, "num_tokens": 3062860205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9879662029529743, "frac_reward_zero_std": 1.0, "grad_norm": 5.868145136528995e-10, "kl": 0.0679931640625, "learning_rate": 9.199253829940136e-09, "loss": 0.0027, "num_tokens": 3063424925.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9881368951096697, "frac_reward_zero_std": 1.0, "grad_norm": 6.976633379808e-10, "kl": 0.071533203125, "learning_rate": 8.945531389293881e-09, "loss": 0.0029, "num_tokens": 3063990221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9883075872663651, "frac_reward_zero_std": 1.0, "grad_norm": 5.448219470205088e-10, "kl": 0.0660400390625, "learning_rate": 8.695355412328133e-09, "loss": 0.0026, "num_tokens": 3064558205.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9884782794230605, "frac_reward_zero_std": 1.0, "grad_norm": 7.512613084187635e-10, "kl": 0.0731201171875, "learning_rate": 8.448725987846295e-09, "loss": 0.0029, "num_tokens": 3065120013.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.988648971579756, "frac_reward_zero_std": 1.0, "grad_norm": 6.237476232448255e-10, "kl": 0.0701904296875, "learning_rate": 8.205643203391678e-09, "loss": 0.0028, "num_tokens": 3065682301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9888196637364514, "frac_reward_zero_std": 1.0, "grad_norm": 7.62334484627759e-10, "kl": 0.0819091796875, "learning_rate": 7.966107145249702e-09, "loss": 0.0033, "num_tokens": 3066245389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9889903558931467, "frac_reward_zero_std": 1.0, "grad_norm": 5.360576834937497e-10, "kl": 0.0677490234375, "learning_rate": 7.73011789845013e-09, "loss": 0.0027, "num_tokens": 3066809901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9891610480498421, "frac_reward_zero_std": 1.0, "grad_norm": 3.981127723687355e-10, "kl": 0.072509765625, "learning_rate": 7.497675546757067e-09, "loss": 0.0029, "num_tokens": 3067381741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9893317402065375, "frac_reward_zero_std": 1.0, "grad_norm": 3.637608001652711e-10, "kl": 0.0655517578125, "learning_rate": 7.268780172681178e-09, "loss": 0.0026, "num_tokens": 3067952301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9895024323632329, "frac_reward_zero_std": 1.0, "grad_norm": 8.943055532546161e-10, "kl": 0.072021484375, "learning_rate": 7.043431857470806e-09, "loss": 0.0029, "num_tokens": 3068511437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9896731245199283, "frac_reward_zero_std": 1.0, "grad_norm": 4.983664913728123e-10, "kl": 0.0709228515625, "learning_rate": 6.821630681117519e-09, "loss": 0.0028, "num_tokens": 3069077117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9898438166766237, "frac_reward_zero_std": 1.0, "grad_norm": 4.921258605611298e-10, "kl": 0.069091796875, "learning_rate": 6.603376722352783e-09, "loss": 0.0028, "num_tokens": 3069644749.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9900145088333191, "frac_reward_zero_std": 1.0, "grad_norm": 7.035454268261719e-10, "kl": 0.071044921875, "learning_rate": 6.388670058647961e-09, "loss": 0.0028, "num_tokens": 3070205389.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9901852009900145, "frac_reward_zero_std": 1.0, "grad_norm": 5.168976777275586e-10, "kl": 0.0714111328125, "learning_rate": 6.177510766216532e-09, "loss": 0.0029, "num_tokens": 3070771565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9903558931467099, "frac_reward_zero_std": 1.0, "grad_norm": 4.789435100643496e-10, "kl": 0.074462890625, "learning_rate": 5.969898920011874e-09, "loss": 0.003, "num_tokens": 3071340349.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9905265853034053, "frac_reward_zero_std": 1.0, "grad_norm": 4.70924925151569e-10, "kl": 0.066650390625, "learning_rate": 5.76583459373059e-09, "loss": 0.0027, "num_tokens": 3071910061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9906972774601007, "frac_reward_zero_std": 1.0, "grad_norm": 6.248909586564804e-10, "kl": 0.069091796875, "learning_rate": 5.565317859805852e-09, "loss": 0.0028, "num_tokens": 3072472605.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9908679696167961, "frac_reward_zero_std": 1.0, "grad_norm": 4.922727938622649e-10, "kl": 0.0738525390625, "learning_rate": 5.368348789415168e-09, "loss": 0.003, "num_tokens": 3073050941.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9910386617734915, "frac_reward_zero_std": 1.0, "grad_norm": 6.93799436324945e-10, "kl": 0.0721435546875, "learning_rate": 5.174927452475942e-09, "loss": 0.0029, "num_tokens": 3073615101.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9912093539301869, "frac_reward_zero_std": 1.0, "grad_norm": 5.157309747780951e-10, "kl": 0.072998046875, "learning_rate": 4.9850539176443666e-09, "loss": 0.0029, "num_tokens": 3074185837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9913800460868823, "frac_reward_zero_std": 1.0, "grad_norm": 5.663127836819658e-10, "kl": 0.0728759765625, "learning_rate": 4.798728252318752e-09, "loss": 0.0029, "num_tokens": 3074748589.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9915507382435778, "frac_reward_zero_std": 1.0, "grad_norm": 5.81060011076877e-10, "kl": 0.071533203125, "learning_rate": 4.615950522639523e-09, "loss": 0.0029, "num_tokens": 3075312301.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9917214304002732, "frac_reward_zero_std": 1.0, "grad_norm": 6.779969170179169e-10, "kl": 0.0692138671875, "learning_rate": 4.436720793484784e-09, "loss": 0.0028, "num_tokens": 3075871741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9918921225569685, "frac_reward_zero_std": 1.0, "grad_norm": 5.085064209109425e-10, "kl": 0.06884765625, "learning_rate": 4.261039128474753e-09, "loss": 0.0028, "num_tokens": 3076438109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9920628147136639, "frac_reward_zero_std": 1.0, "grad_norm": 3.811144711675134e-10, "kl": 0.0679931640625, "learning_rate": 4.08890558997177e-09, "loss": 0.0027, "num_tokens": 3077005661.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9922335068703593, "frac_reward_zero_std": 1.0, "grad_norm": 4.677117252483234e-10, "kl": 0.069091796875, "learning_rate": 3.9203202390747375e-09, "loss": 0.0028, "num_tokens": 3077572557.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9924041990270547, "frac_reward_zero_std": 1.0, "grad_norm": 5.772742720322899e-10, "kl": 0.070556640625, "learning_rate": 3.755283135625787e-09, "loss": 0.0028, "num_tokens": 3078138109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9925748911837501, "frac_reward_zero_std": 1.0, "grad_norm": 7.956424895383914e-10, "kl": 0.0771484375, "learning_rate": 3.5937943382080565e-09, "loss": 0.0031, "num_tokens": 3078704221.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9927455833404455, "frac_reward_zero_std": 1.0, "grad_norm": 6.742165831757891e-10, "kl": 0.072265625, "learning_rate": 3.4358539041434712e-09, "loss": 0.0029, "num_tokens": 3079266477.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9929162754971409, "frac_reward_zero_std": 1.0, "grad_norm": 3.7852213706779845e-10, "kl": 0.0684814453125, "learning_rate": 3.2814618894960736e-09, "loss": 0.0027, "num_tokens": 3079834173.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9930869676538363, "frac_reward_zero_std": 1.0, "grad_norm": 5.561607162981613e-10, "kl": 0.0723876953125, "learning_rate": 3.1306183490686926e-09, "loss": 0.0029, "num_tokens": 3080401197.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9932576598105317, "frac_reward_zero_std": 1.0, "grad_norm": 3.4906147151699074e-10, "kl": 0.065185546875, "learning_rate": 2.983323336405164e-09, "loss": 0.0026, "num_tokens": 3080973741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9934283519672271, "frac_reward_zero_std": 1.0, "grad_norm": 6.185219104793831e-10, "kl": 0.071533203125, "learning_rate": 2.839576903790331e-09, "loss": 0.0029, "num_tokens": 3081541357.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9935990441239225, "frac_reward_zero_std": 1.0, "grad_norm": 5.508278603709342e-10, "kl": 0.0733642578125, "learning_rate": 2.6993791022489336e-09, "loss": 0.0029, "num_tokens": 3082113053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9937697362806179, "frac_reward_zero_std": 1.0, "grad_norm": 5.145192513810109e-10, "kl": 0.0723876953125, "learning_rate": 2.5627299815467188e-09, "loss": 0.0029, "num_tokens": 3082679709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9939404284373133, "frac_reward_zero_std": 1.0, "grad_norm": 5.141495715782781e-10, "kl": 0.0704345703125, "learning_rate": 2.4296295901882204e-09, "loss": 0.0028, "num_tokens": 3083244525.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9941111205940087, "frac_reward_zero_std": 1.0, "grad_norm": 6.687168808766884e-10, "kl": 0.07421875, "learning_rate": 2.30007797542009e-09, "loss": 0.003, "num_tokens": 3083809117.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9942818127507042, "frac_reward_zero_std": 1.0, "grad_norm": 7.596999127373538e-10, "kl": 0.07177734375, "learning_rate": 2.1740751832266538e-09, "loss": 0.0029, "num_tokens": 3084368317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9944525049073996, "frac_reward_zero_std": 1.0, "grad_norm": 6.270419802597472e-10, "kl": 0.0716552734375, "learning_rate": 2.051621258337688e-09, "loss": 0.0029, "num_tokens": 3084933901.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9946231970640949, "frac_reward_zero_std": 1.0, "grad_norm": 4.3778359069538296e-10, "kl": 0.0723876953125, "learning_rate": 1.932716244216204e-09, "loss": 0.0029, "num_tokens": 3085502653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9947938892207903, "frac_reward_zero_std": 1.0, "grad_norm": 5.469255598845655e-10, "kl": 0.070556640625, "learning_rate": 1.8173601830717701e-09, "loss": 0.0028, "num_tokens": 3086065501.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9949645813774857, "frac_reward_zero_std": 1.0, "grad_norm": 5.771935887648311e-10, "kl": 0.0694580078125, "learning_rate": 1.7055531158505223e-09, "loss": 0.0028, "num_tokens": 3086628109.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9951352735341811, "frac_reward_zero_std": 1.0, "grad_norm": 4.6547314922916824e-10, "kl": 0.072265625, "learning_rate": 1.597295082240713e-09, "loss": 0.0029, "num_tokens": 3087191709.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9953059656908765, "frac_reward_zero_std": 1.0, "grad_norm": 2.592292145862876e-10, "kl": 0.0716552734375, "learning_rate": 1.4925861206693814e-09, "loss": 0.0029, "num_tokens": 3087761549.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9954766578475719, "frac_reward_zero_std": 1.0, "grad_norm": 5.404237530683147e-10, "kl": 0.0699462890625, "learning_rate": 1.3914262683045744e-09, "loss": 0.0028, "num_tokens": 3088331437.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9956473500042673, "frac_reward_zero_std": 1.0, "grad_norm": 4.175603184202771e-10, "kl": 0.0677490234375, "learning_rate": 1.2938155610542347e-09, "loss": 0.0027, "num_tokens": 3088899597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9958180421609627, "frac_reward_zero_std": 1.0, "grad_norm": 6.546741004252111e-10, "kl": 0.0689697265625, "learning_rate": 1.199754033567313e-09, "loss": 0.0028, "num_tokens": 3089462061.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9959887343176581, "frac_reward_zero_std": 1.0, "grad_norm": 6.408137843595976e-10, "kl": 0.0721435546875, "learning_rate": 1.109241719230436e-09, "loss": 0.0029, "num_tokens": 3090024973.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9961594264743535, "frac_reward_zero_std": 1.0, "grad_norm": 7.314988018764952e-10, "kl": 0.074951171875, "learning_rate": 1.0222786501745685e-09, "loss": 0.003, "num_tokens": 3090587837.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9963301186310489, "frac_reward_zero_std": 1.0, "grad_norm": 4.460871334125973e-10, "kl": 0.0712890625, "learning_rate": 9.388648572672409e-10, "loss": 0.0029, "num_tokens": 3091153053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9965008107877443, "frac_reward_zero_std": 1.0, "grad_norm": 6.012484838678063e-10, "kl": 0.0718994140625, "learning_rate": 8.590003701181016e-10, "loss": 0.0029, "num_tokens": 3091718701.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9966715029444397, "frac_reward_zero_std": 1.0, "grad_norm": 4.372691047723987e-10, "kl": 0.0704345703125, "learning_rate": 7.826852170744748e-10, "loss": 0.0028, "num_tokens": 3092295149.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9968421951011351, "frac_reward_zero_std": 1.0, "grad_norm": 4.1318780338634825e-10, "kl": 0.07080078125, "learning_rate": 7.099194252269126e-10, "loss": 0.0028, "num_tokens": 3092863565.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9970128872578305, "frac_reward_zero_std": 1.0, "grad_norm": 6.202494664145615e-10, "kl": 0.0706787109375, "learning_rate": 6.407030204047537e-10, "loss": 0.0028, "num_tokens": 3093428269.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.997183579414526, "frac_reward_zero_std": 1.0, "grad_norm": 5.145942607814559e-10, "kl": 0.06982421875, "learning_rate": 5.750360271761235e-10, "loss": 0.0028, "num_tokens": 3093993277.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9973542715712213, "frac_reward_zero_std": 1.0, "grad_norm": 4.815089975249606e-10, "kl": 0.0721435546875, "learning_rate": 5.129184688523747e-10, "loss": 0.0029, "num_tokens": 3094564189.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9975249637279167, "frac_reward_zero_std": 1.0, "grad_norm": 6.833911630906904e-10, "kl": 0.070556640625, "learning_rate": 4.543503674803162e-10, "loss": 0.0028, "num_tokens": 3095125309.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9976956558846121, "frac_reward_zero_std": 1.0, "grad_norm": 7.009986247892807e-10, "kl": 0.069580078125, "learning_rate": 3.993317438522049e-10, "loss": 0.0028, "num_tokens": 3095687757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9978663480413075, "frac_reward_zero_std": 1.0, "grad_norm": 6.477809195228889e-10, "kl": 0.0712890625, "learning_rate": 3.4786261749575336e-10, "loss": 0.0029, "num_tokens": 3096249741.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9980370401980029, "frac_reward_zero_std": 1.0, "grad_norm": 9.14725987496916e-10, "kl": 0.07373046875, "learning_rate": 2.99943006681902e-10, "loss": 0.003, "num_tokens": 3096811597.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9982077323546983, "frac_reward_zero_std": 1.0, "grad_norm": 6.108759261205938e-10, "kl": 0.0714111328125, "learning_rate": 2.555729284192676e-10, "loss": 0.0029, "num_tokens": 3097374717.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9983784245113937, "frac_reward_zero_std": 1.0, "grad_norm": 5.743820121218904e-10, "kl": 0.0718994140625, "learning_rate": 2.147523984585842e-10, "loss": 0.0029, "num_tokens": 3097939805.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9985491166680891, "frac_reward_zero_std": 1.0, "grad_norm": 5.414359469229852e-10, "kl": 0.0714111328125, "learning_rate": 1.7748143128937246e-10, "loss": 0.0029, "num_tokens": 3098508957.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9987198088247845, "frac_reward_zero_std": 1.0, "grad_norm": 6.168677684106501e-10, "kl": 0.071533203125, "learning_rate": 1.4376004014216017e-10, "loss": 0.0029, "num_tokens": 3099073821.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9988905009814799, "frac_reward_zero_std": 1.0, "grad_norm": 5.00527050853811e-10, "kl": 0.06787109375, "learning_rate": 1.1358823698404131e-10, "loss": 0.0027, "num_tokens": 3099637757.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9990611931381753, "frac_reward_zero_std": 1.0, "grad_norm": 4.529207300217758e-10, "kl": 0.0650634765625, "learning_rate": 8.696603252866808e-11, "loss": 0.0026, "num_tokens": 3100203885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9992318852948707, "frac_reward_zero_std": 1.0, "grad_norm": 6.505254899019415e-10, "kl": 0.072265625, "learning_rate": 6.389343622403844e-11, "loss": 0.0029, "num_tokens": 3100765885.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9994025774515661, "frac_reward_zero_std": 1.0, "grad_norm": 7.184588305303324e-10, "kl": 0.06884765625, "learning_rate": 4.437045625915737e-11, "loss": 0.0028, "num_tokens": 3101327053.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9995732696082615, "frac_reward_zero_std": 1.0, "grad_norm": 5.548119591819174e-10, "kl": 0.0714111328125, "learning_rate": 2.839709956625747e-11, "loss": 0.0029, "num_tokens": 3101890317.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.999743961764957, "frac_reward_zero_std": 1.0, "grad_norm": 4.3193996079624006e-10, "kl": 0.0662841796875, "learning_rate": 1.5973371813027273e-11, "loss": 0.0027, "num_tokens": 3102460397.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9999146539216524, "frac_reward_zero_std": 1.0, "grad_norm": 4.10613893427095e-10, "kl": 0.0687255859375, "learning_rate": 7.099277411493077e-12, "loss": 0.0028, "num_tokens": 3103026653.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 5858 }, { "epoch": 0.9999146539216524, "step": 5858, "total_flos": 0.0, "train_loss": 3.769922398664146e-06, "train_runtime": 505.2607, "train_samples_per_second": 185.514, "train_steps_per_second": 11.596 } ], "logging_steps": 1, "max_steps": 5859, "num_input_tokens_seen": 3103026653, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }